def calc_gmm(self, dim=10, calc_times=100, force=False):
    from sklearn.mixture import GaussianMixture

    logger.info("calculating gmm centers")
    X = self.mus
    gmm_path = self.path_to_save_results / "gmm.pkl"
    # honor the force flag (the original accepted it but never used it)
    if gmm_path.exists() and not force:
        logger.info(f"loading {gmm_path}")
        with gmm_path.open("rb") as f:
            best_gmm = pickle.load(f)
        best_aic = best_gmm.aic(X)
    else:
        best_aic = np.inf
        pbar = tqdm(range(calc_times))
        for i in pbar:
            gmm = GaussianMixture(dim, covariance_type="full").fit(X)
            if gmm.aic(X) < best_aic:
                best_aic = gmm.aic(X)
                best_gmm = gmm
            pbar.set_description("[" + "⠸⠴⠦⠇⠋⠙"[i % 6] + "]" + f"{best_aic:.2f}")
        with gmm_path.open("wb") as f:
            pickle.dump(best_gmm, f)
    logger.info(f"best aic : {best_aic}")
    self.gmm = best_gmm
    self.aic = best_aic
    self.gmm_classes = best_gmm.predict(X)
    self.gmm_centers = best_gmm.means_
def GMM_find_k(data):
    """Model selection part.

    Fit GMMs for several values of K (2-10) using the scikit-learn library and
    find an appropriate K using AIC. Set random_state=0 on the clustering
    function (required by the grader).

    Return: the number of components K with the lowest AIC on the given data.
    """
    # float('inf') is safer than the original 10000000: AIC can exceed that
    # for large datasets, which would leave min_index unassigned.
    minimum_AIC = float("inf")
    min_index = 2
    for i in range(2, 11):
        GMM = GaussianMixture(n_components=i, random_state=0).fit(data)
        if GMM.aic(data) < minimum_AIC:
            minimum_AIC = GMM.aic(data)
            min_index = i
    return min_index
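# A minimal usage sketch for GMM_find_k on synthetic data (three well-separated
# blobs); the data below are illustrative assumptions, and GaussianMixture is
# assumed to be imported from sklearn.mixture as in the snippet above.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
data = np.vstack([rng.normal(m, 0.3, (100, 2)) for m in (0, 4, 8)])
print(GMM_find_k(data))  # expected to print 3 for clearly separated blobs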
def fitGaussian(ipds, initmean=False, zmw=-1):
    with np.errstate(invalid='ignore'):
        ridx = np.where(ipds > 0)[0]
        lrnn = np.log(ipds[ridx]).reshape(-1, 1)
    gmm1 = GaussianMixture(1, covariance_type='spherical')
    if initmean:
        gmm2 = GaussianMixture(2, covariance_type='spherical',
                               means_init=np.array([1.1, 3]).reshape((2, 1)),
                               weights_init=np.array([.85, .15]), tol=1e-6)
    else:
        gmm2 = GaussianMixture(2, covariance_type='spherical')
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        gmm1.fit(lrnn)
        if not gmm1.converged_:
            print('zmw #%d did not converge on gmm1' % (zmw))
        gmm2.fit(lrnn)
        convround = 0
        # Retry with progressively more restarts and looser tolerances.
        if not gmm2.converged_:
            gmm2 = GaussianMixture(2, covariance_type='spherical',
                                   means_init=np.array([1.1, 3]).reshape((2, 1)),
                                   n_init=10, init_params='random', tol=1e-5, max_iter=200)
            gmm2.fit(lrnn)
            convround = 1
        if not gmm2.converged_:
            gmm2 = GaussianMixture(2, covariance_type='spherical',
                                   means_init=np.array([1.1, 3]).reshape((2, 1)),
                                   n_init=20, init_params='random', tol=1e-4, max_iter=400)
            gmm2.fit(lrnn)
            convround = 2
        if not gmm2.converged_:
            gmm2 = GaussianMixture(2, covariance_type='spherical',
                                   means_init=np.array([1.1, 3]).reshape((2, 1)),
                                   n_init=40, init_params='random', tol=1e-3, max_iter=600)
            gmm2.fit(lrnn)
            convround = 3
        if not gmm2.converged_:
            convround = 9
            print('zmw #%d did not converge on gmm2 even after extensions' % (zmw))
    aicdif = gmm1.aic(lrnn) - gmm2.aic(lrnn)
    mixmeans = gmm2.means_.flatten()
    mixweights = gmm2.weights_.flatten()
    elevstate = np.argmax(mixmeans)
    resp = gmm2.predict_proba(np.log(ipds[ridx].reshape(-1, 1)))
    respfull = np.empty(len(ipds), dtype='float32')
    respfull.fill(np.nan)
    respfull[ridx] = resp[:, elevstate]
    convergInf = str(convround) + '.' + str(gmm2.n_iter_)
    return (respfull,
            np.array([mixmeans[1 - elevstate], mixmeans[elevstate]]),
            np.array([mixweights[1 - elevstate], mixweights[elevstate]]),
            aicdif, convergInf)
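# A hedged usage sketch for fitGaussian with a synthetic IPD vector (two
# log-normal populations, mimicking unmodified vs. modified bases); the sample
# sizes and parameters here are illustrative assumptions, not values from the
# original source.
import numpy as np
import warnings
from sklearn.mixture import GaussianMixture
from sklearn.exceptions import ConvergenceWarning

rng = np.random.default_rng(0)
ipds = np.concatenate([rng.lognormal(1.1, 0.4, 850), rng.lognormal(3.0, 0.4, 150)])
resp, means, weights, aicdif, conv = fitGaussian(ipds, initmean=True, zmw=1)
# means/weights come back ordered (lower state, elevated state); a large
# positive aicdif favors the two-component model.
print(means, weights, aicdif, conv)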
def GMM(self):
    data_pull = track_data(self.token, self.seed_track)
    data = data_pull[0]
    tracks = data_pull[1]

    # This block optimizes the number of components, starting from 36 and
    # walking the count down (or up) until the information criterion stops
    # improving.
    n_comps = 36
    gm_1 = GaussianMixture(n_components=n_comps, random_state=0).fit(data)
    gm_2 = GaussianMixture(n_components=n_comps + 1, random_state=0).fit(data)
    if gm_2.aic(data) > gm_1.aic(data):
        # NOTE: the descending branch compares BIC while the ascending branch
        # compares AIC, as in the original code.
        while gm_2.bic(data) > gm_1.bic(data) and n_comps > 1:
            gm_2 = gm_1
            n_comps = n_comps - 1
            gm_1 = GaussianMixture(n_components=n_comps, random_state=0).fit(data)
    else:
        while gm_2.aic(data) < gm_1.aic(data):
            gm_1 = gm_2
            n_comps = n_comps + 1
            gm_2 = GaussianMixture(n_components=n_comps, random_state=0).fit(data)

    # Block that constructs the final analysis of the data set: collect the
    # per-component means and the diagonal of each covariance matrix
    # (restricted to the stat_names dimensions). "variances" replaces the
    # original name "vars", which shadowed the builtin.
    gm_out = GaussianMixture(n_components=n_comps, random_state=0).fit(data)
    means = []
    variances = []
    for i in range(n_comps):
        vars_i = np.diag(gm_out.covariances_[i])[:len(self.stat_names)]
        if i == 0:
            means = gm_out.means_[i]
            variances = vars_i
        else:
            means = np.vstack((means, gm_out.means_[i]))
            variances = np.vstack((variances, vars_i))
    return [means, variances, n_comps, tracks, data]
def _fit_cluster(data, seed=None):
    """
    Fit a Gaussian Mixture Model to the given data.

    Parameters
    ----------
    data : array-like, shape=(n_samples, n_features)
        Data.
    seed : None or int or RandomState, default=None
        Initial seed for the RandomState. If seed is None, return the
        RandomState singleton. If seed is an int, return a RandomState with
        the seed set to the int. If seed is a RandomState, return that
        RandomState.

    Returns
    -------
    model : GaussianMixture
        The best fitted Gaussian Mixture Model as determined by the mean of
        the BIC and AIC for the respective model.
    """
    data = np.array(data)
    models = []
    abic = []
    n_components = min(len(data), 10)
    for i in range(n_components):
        # Skip component counts that leave fewer than two samples per component.
        if len(data) < 2 * (i + 1):
            continue
        m = GMM(n_components=i + 1, n_init=5, random_state=seed)
        m.fit(data)
        models.append(m)
        abic.append(np.mean([m.bic(data), m.aic(data)]))
    return models[np.argmin(abic)]
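# Usage sketch for _fit_cluster, assuming GMM is an alias for
# sklearn.mixture.GaussianMixture as the snippet's call signature suggests;
# the data and seed below are illustrative.
import numpy as np
from sklearn.mixture import GaussianMixture as GMM

rng = np.random.default_rng(1)
pts = np.vstack([rng.normal(0, 0.2, (50, 2)), rng.normal(3, 0.2, (50, 2))])
model = _fit_cluster(pts, seed=1)
print(model.n_components)  # expected: 2 for two tight, well-separated clusters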
def aic(self):
    AIC = np.zeros(self.n_components - 1, dtype=float)
    for n in range(1, self.n_components):
        clf = GaussianMixture(n_components=n, covariance_type='diag', random_state=0)
        clf.fit(self.data)
        AIC[n - 1] = clf.aic(self.data)
    # print(AIC)
    aic_k = int(np.argmin(AIC)) + 1
    print("The number of cluster centers AIC chose is: " + str(aic_k))
    aic_gmm = GaussianMixture(n_components=aic_k, covariance_type='diag', random_state=0)
    aic_gmm.fit(self.data)
    labels = aic_gmm.predict(self.data)
    # Color each point by its cluster label. The original loop started at
    # index 1 (silently dropping the first sample) and rescanned every cluster
    # index per point; indexing the color by the label directly is equivalent.
    for i in range(len(labels)):
        plt.scatter(self.data[i][0], self.data[i][1], s=15, c=self.color[labels[i]])
    plt.title('GMM-AIC-' + str(len(labels)))
    plt.xlabel('x')
    plt.ylabel('y')
    # plt.savefig("./Fig/sample_size_aic_" + str(len(labels)) + ".png")
    # plt.savefig("./Fig/cluster_num_aic_" + str(len(labels)) + ".png")
    # plt.savefig("./Fig/dimension_num_aic_" + str(len(self.data[0])) + ".png")
    plt.show()
def model_fitting(data, n):
    total_obs = len(data)
    aic = []
    bic = []
    n_components_range = range(1, n + 1)
    print('fitting Gaussian Mixture models to data....')
    for n_components in n_components_range:
        gmm = GMM(n_components=n_components, covariance_type='full')
        gmm.fit(data)
        aic.append(gmm.aic(data))
        bic.append(gmm.bic(data))
    print('evaluating goodness of fit....')
    N = optimal_n_components(aic, total_obs)
    gmm = GMM(n_components=N, covariance_type='full')
    clf = gmm.fit(data)
    mus = clf.means_
    sigmas = clf.covariances_
    weights = clf.weights_
    print('Writing model to disk....')
    model = [mus, sigmas, weights, aic, bic]
    return model
def lnp_Xw(X_w, x=None, method='gmm', n_comp_max=10, info_crit='bic', njobs=1):
    ''' Estimate the multi-dimensional pdf at x for a given X_w using a
    nonparametric density estimation (either KDE or GMM).
    '''
    if x is None:
        raise ValueError
    # 'gkde' is included here; the original check rejected it, which made the
    # gkde branch below unreachable.
    if method not in ['kde', 'gkde', 'gmm']:
        raise ValueError("method = kde, gkde, or gmm")

    if method == 'gmm':
        # find the best component count using an information criterion (BIC/AIC)
        gmms, ics = [], []
        for i_comp in range(1, n_comp_max + 1):
            gmm = GMix(n_components=i_comp)
            gmm.fit(X_w)
            gmms.append(gmm)
            if info_crit == 'bic':    # Bayesian Information Criterion
                ics.append(gmm.bic(X_w))
            elif info_crit == 'aic':  # Akaike Information Criterion
                ics.append(gmm.aic(X_w))
        ibest = np.array(ics).argmin()  # lower is better!
        kern = gmms[ibest]
    elif method == 'kde':
        kern = UT.KayDE(X_w)
    elif method == 'gkde':
        # find the best bandwidth using a cross-validated grid search
        # (GridSearchCV's keyword is n_jobs, not njobs)
        grid = GridSearchCV(skKDE(), {'bandwidth': np.linspace(0.1, 1.0, 30)},
                            cv=10, n_jobs=njobs)  # 10-fold cross-validation
        grid.fit(X_w)
        kern = grid.best_estimator_

    if len(x.shape) == 1:
        return kern.score_samples(x[:, None])
    else:
        return kern.score_samples(x)
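# A hedged sketch of the 'gkde' path in isolation, assuming skKDE is
# sklearn.neighbors.KernelDensity (the data here are illustrative):
# cross-validated bandwidth selection yields the kernel used for score_samples.
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV

X_w = np.random.default_rng(0).normal(size=(200, 1))
grid = GridSearchCV(KernelDensity(), {'bandwidth': np.linspace(0.1, 1.0, 30)}, cv=10)
grid.fit(X_w)
print(grid.best_estimator_.bandwidth)  # selected bandwidth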
def gmm_eval(data, range_n_clusters=range(2, 16)):
    aics = []
    bics = []
    for n_clusters in range_n_clusters:
        clusterer = GaussianMixture(n_clusters, random_state=42)
        clusterer = clusterer.fit(data)
        cluster_labels = clusterer.predict(data)
        aics.append(clusterer.aic(data))
        bics.append(clusterer.bic(data))

    # Elbow detection: distance from each (k, score) point to the chord
    # joining the first and last points. The original loop used x = i + 1
    # (off by one against the cluster counts) and skipped the last point;
    # both are corrected here.
    n_clusters_list = list(range_n_clusters)
    distances_1 = []
    p1 = Point(initx=min(n_clusters_list), inity=aics[0])
    p2 = Point(initx=max(n_clusters_list), inity=aics[-1])
    for i, k in enumerate(n_clusters_list):
        p = Point(initx=k, inity=aics[i])
        distances_1.append(p.distance_to_line(p1, p2))

    distances_2 = []
    p1 = Point(initx=min(n_clusters_list), inity=bics[0])
    p2 = Point(initx=max(n_clusters_list), inity=bics[-1])
    for i, k in enumerate(n_clusters_list):
        p = Point(initx=k, inity=bics[i])
        distances_2.append(p.distance_to_line(p1, p2))

    return aics, bics, distances_1, distances_2
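# gmm_eval relies on an external Point helper that is not shown. A minimal
# sketch of what it is assumed to look like (perpendicular distance from a
# point to the line through p1 and p2), so the elbow computation above is
# self-contained; this is a guess at the interface, not the original class.
import numpy as np

class Point:
    def __init__(self, initx=0.0, inity=0.0):
        self.x, self.y = initx, inity

    def distance_to_line(self, p1, p2):
        # |(p2-p1) x (self-p1)| / |p2-p1|: the cross-product formula for the
        # distance from a point to the line defined by two points.
        num = abs((p2.y - p1.y) * self.x - (p2.x - p1.x) * self.y
                  + p2.x * p1.y - p2.y * p1.x)
        den = np.hypot(p2.y - p1.y, p2.x - p1.x)
        return num / den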
def GMM_RF(X_test, X_train):
    # NOTE: X_all and Y_train are assumed to be defined in an enclosing scope.
    lowest_aic = np.infty
    aic = []
    # find the best GMM parameters; the scores collected here are AIC values,
    # although the original variable was named "bic"
    n_components_range = range(1, 7)
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type)
            gmm.fit(X_all)
            aic.append(gmm.aic(X_all))
            if aic[-1] < lowest_aic:
                lowest_aic = aic[-1]
                best_gmm = gmm
    # print(best_gmm)
    best_gmm.fit(X_all)
    X_train = best_gmm.predict_proba(X_train)
    X_test = best_gmm.predict_proba(X_test)

    rf = RandomForestClassifier()
    # find the best parameters of the random forest
    grid_search_rf = GridSearchCV(rf, param_grid=dict(), verbose=3,
                                  scoring='accuracy', cv=10).fit(X_train, Y_train)
    rf_best = grid_search_rf.best_estimator_
    rf_best.fit(X_train, Y_train)
    pred = rf_best.predict(X_test)
    return pred
def get_best_gmm(X_matrix, n_components, score_df, n_sampling=20, means_init=None):
    n_points = len(X_matrix)
    for i in range(n_sampling):
        gmm = GaussianMixture(
            n_components=n_components,
            # reg_covar=0.0000001,
            # covariance_type='full',
            means_init=means_init,
            # weights_init=[0.1, 0.33, 0.26, 0.1],
            # init_params='random',
        )
        # Fit on a random 80% subsample each round.
        sample = random.sample(range(n_points), int(0.8 * n_points))
        X_rand = X_matrix[sample]
        # X_rand = X_matrix
        # np.random.shuffle(X_matrix)
        gmm.fit(X=X_rand)
        this_AIC = gmm.aic(X=X_rand)
        this_BIC = gmm.bic(X=X_rand)
        score_df.loc[i, "AIC"] = this_AIC
        score_df.loc[i, "BIC"] = this_BIC
        # keep the fit with the lowest AIC across the subsampling rounds
        if i == 0 or this_AIC < best_AIC:
            best_AIC = this_AIC
            best_gmm = gmm
    return best_gmm, score_df
def CV_gauss(input_data, index_to_check):
    X = input_data
    # ros = RandomOverSampler(random_state=0)
    # X = ros.fit_sample(X)
    N, M = X.shape

    # Range of K's to try
    KRange = range(1, 8)
    T = len(KRange)
    covar_type = 'full'  # you can try out 'diag' as well
    reps = 5  # number of fits with different initializations; best result is kept

    # Allocate variables
    BIC = np.zeros((T,))
    AIC = np.zeros((T,))
    CVE = np.zeros((T,))

    # K-fold cross-validation
    CV = model_selection.KFold(n_splits=10, shuffle=True)

    for t, K in enumerate(KRange):
        print('Fitting model for K={0}'.format(K))
        # Fit Gaussian mixture model
        gmm = GaussianMixture(n_components=K, covariance_type=covar_type, n_init=reps).fit(X)
        BIC[t] = gmm.bic(X)
        AIC[t] = gmm.aic(X)
        # For each cross-validation fold
        for train_index, test_index in CV.split(X):
            # extract training and test set for current CV fold
            X_train = X[train_index]
            X_test = X[test_index]
            # Fit Gaussian mixture model to X_train
            gmm = GaussianMixture(n_components=K, covariance_type=covar_type,
                                  n_init=reps).fit(X_train)
            # accumulate the negative log-likelihood of X_test
            CVE[t] += -gmm.score_samples(X_test).sum()

    # Plot results
    print(CVE)
    figure(1)
    plot(KRange, BIC, '-*b')
    plot(KRange, AIC, '-xr')
    plot(KRange, 2 * CVE, '-ok')
    legend(['BIC', 'AIC', 'Crossvalidation'])
    xlabel('K')
    show()
def gmm_opt(self, minCluster=5, maxCluster=100, interval=5, testSize=0.25,
            covarType='full', plot=False):
    """
    Perform Gaussian mixture modeling on a held-out test set over an
    increasing number of clusters to optimize the cluster count via AIC/BIC.

    Parameters
    ----------
    minCluster: int
        minimum number of clusters to test
    maxCluster: int
        maximum number of clusters to test
    interval: int
        interval used to jump cluster numbers
    testSize: float
        fraction of samples to use in the test set
    covarType: string
        keyword indicating the type of covariance matrix to use

    Returns
    -------
    cluster count minimizing listAIC, cluster count minimizing listBIC,
    listAIC, listBIC
    """
    # Initialize lists used to collect scores per cluster size
    listAIC = list()
    listBIC = list()

    # Split features into training and test sets
    train, test = train_test_split(self.featMatrix, test_size=testSize)

    # Loop over the provided range of cluster numbers
    for i in range(minCluster, (maxCluster + 1), interval):
        if i % 20 == 0:
            print("Current cluster number: {}".format(i))
        # create the GMM object and train it on the training set
        EMmodel = GaussianMixture(n_components=i, covariance_type=covarType,
                                  init_params='kmeans')
        EMmodel.fit(train)
        # Calculate AIC/BIC on the held-out test set
        listAIC.append(EMmodel.aic(test))
        listBIC.append(EMmodel.bic(test))

    # Range of cluster sizes tested (maps argmin back to a cluster count)
    x = np.arange(minCluster, (maxCluster + 1), interval)

    # Plot results for both AIC and BIC
    if plot:
        fig = plt.figure(figsize=(14, 4))
        ax1 = fig.add_subplot(121)
        ax1.set_xlabel("Number of Clusters")
        ax1.set_ylabel("AIC")
        ax2 = fig.add_subplot(122)
        ax2.set_xlabel("Number of Clusters")
        ax2.set_ylabel("BIC")
        ax1.plot(x, listAIC)
        ax2.plot(x, listBIC)

    return x[np.argmin(listAIC)], x[np.argmin(listBIC)], listAIC, listBIC
def elbow(self, isKM=True):
    Error = []
    rng = self.cluster_range
    for i in rng:
        if isKM:
            km = KMeans(n_clusters=i, n_jobs=-1).fit(self.dataX)
            Error.append(km.inertia_)
        else:
            em = GaussianMixture(n_components=i, init_params='random',
                                 random_state=7).fit(self.dataX)
            Error.append((em.bic(self.dataX), em.aic(self.dataX)))

    import matplotlib.pyplot as plt
    if isKM:
        plt.plot(rng, Error)
    else:
        err = pd.DataFrame(Error)
        plt.plot(rng, err.iloc[:, 0], label='BIC')
        plt.plot(rng, err.iloc[:, 1], label='AIC')
        plt.legend(loc="best")
    clustererType = 'K-Means' if isKM else 'E.M.'
    ylabel = 'Error' if isKM else 'B.I.C. / A.I.C.'
    plt.title('Elbow Method Analysis for %s on %s' % (clustererType, self.name))
    plt.xlabel('No of clusters')
    # the original computed ylabel but always wrote the literal 'Error'
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.savefig(
        os.path.join(self.output, self.name + '-' + clustererType + '-elbow.png'))
def gmm(X, Y):
    # time.clock() was removed in Python 3.8; perf_counter is the replacement
    from time import perf_counter

    print("Running GMM")
    # Range of k to test
    krange = np.arange(2, 50)
    # Opening log file
    log_path = '../logs/pet_gmm.csv'
    with open(log_path, 'w') as f:
        f.write('k,time,ari,homogeneity,completeness,silhouette,aic,bic\n')
    for k in krange:
        # Computing GMM
        start_time = perf_counter()
        gmm_model = GaussianMixture(k, n_init=10).fit(X)
        clusters = gmm_model.predict(X)
        time_taken = perf_counter() - start_time
        # Computing metrics
        ari = adjusted_rand_score(Y, clusters)
        hom = homogeneity_score(Y, clusters)
        com = completeness_score(Y, clusters)
        sil = silhouette_score(X, clusters)  # Euclidean distance
        aic = gmm_model.aic(X)
        bic = gmm_model.bic(X)
        # Logging metrics
        with open(log_path, 'a') as f:
            out = '{},{},{},{},{},{},{},{}\n'.format(k, time_taken, ari, hom,
                                                     com, sil, aic, bic)
            f.write(out)
def optimalNbClustersGMM(pc, c_min, c_max, top=2, plot=False):
    aic = []
    bic = []
    sil = []
    numberOfClusters = range(c_min, c_max)
    for n in numberOfClusters:
        model = GaussianMixture(n, covariance_type='full', random_state=0).fit(pc)
        clusters = model.predict(pc)
        bic.append(model.bic(pc))
        aic.append(model.aic(pc))
        sil.append(metrics.silhouette_score(pc, clusters, metric='euclidean',
                                            sample_size=None, random_state=None))
    if plot:
        plt.plot(numberOfClusters, bic, label='BIC')
        plt.plot(numberOfClusters, aic, label='AIC')
        plt.legend()
        plt.title('BIC/AIC')
        plt.xlabel('n_components')
        plt.figure()
        plt.plot(numberOfClusters, sil, label='sil')
    # Lowest BIC/AIC and highest silhouette are best; shift indices by c_min
    # to recover the actual cluster counts.
    bestBic = np.argsort(bic)[:top] + c_min
    bestAic = np.argsort(aic)[:top] + c_min
    bestSil = np.argsort(sil)[::-1][:top] + c_min
    return bestBic, bestAic, bestSil
def dimReducedClusters(x_data, y_data, x, n):
    kmeans = KMeans(n_clusters=2)
    kmeans.fit(x_data)
    kmeans.predict(x_data)
    kLabels = kmeans.labels_
    if x == 0:
        nData = pd.DataFrame(x_data)
        nData['cluster'] = kLabels
        nNetwork(nData, y_data, n + ': After K-Means')

    em = GaussianMixture(n_components=2)
    em.fit(x_data)
    eLabels = em.predict(x_data)
    if x == 0:
        nData = pd.DataFrame(x_data)
        nData['cluster'] = eLabels
        nNetwork(nData, y_data, n + ': After EM')

    a = silhouette_score(x_data, kLabels)
    b = adjusted_rand_score(y_data, kLabels)
    c = adjusted_mutual_info_score(y_data, kLabels)
    d = homogeneity_score(y_data, kLabels)
    e = completeness_score(y_data, kLabels)
    f = fowlkes_mallows_score(y_data, kLabels)
    g = em.bic(x_data)
    h = em.aic(x_data)
    return a, b, c, d, e, f, g, h
def plot_gmm_scores(k_range, X_train_transformed, title):
    bic_scores = []
    aic_scores = []
    for k in k_range:
        gmm = GaussianMixture(k, max_iter=500, n_init=10)
        gmm.fit(X_train_transformed)
        bic_scores.append(gmm.bic(X_train_transformed))
        aic_scores.append(gmm.aic(X_train_transformed))

    title_dic = {'fontsize': 7, 'fontweight': 'bold'}
    fig, (ax1) = plt.subplots(1, 1, figsize=(5, 2))
    ax1.set_xlabel("K", title_dic)
    ax1.set_title(title, title_dic)
    ax1.set_ylabel("Score", title_dic)
    ax1.tick_params(axis="x", labelsize=7)
    ax1.tick_params(axis="y", labelsize=7)
    ax1.yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
    ax1.plot(k_range, bic_scores, label="BIC", linewidth=2)
    ax1.grid()
    ax1.plot(k_range, aic_scores, label="AIC", linewidth=2)
    ax1.legend(loc='best', fontsize=6)
    plt.tight_layout()
    plt.grid()
    path = os.path.join(OUTPUT)
    filename = title + ".png"
    filename = os.path.join(path, filename)
    plt.savefig(filename)
    plt.close()
def bic_model_selection(x, kval, title, filnam, ylabel):
    plt.clf()
    cv_types = ['spherical', 'tied', 'diag', 'full']
    bic = []
    krange = np.arange(1, kval, 1)
    for cv in cv_types:
        for k in krange:
            gm = GaussianMixture(n_components=k, covariance_type=cv)
            gm.fit(x)
            # the ylabel argument doubles as the criterion selector
            if ylabel == "BIC Score":
                bic.append(gm.bic(x))
            elif ylabel == "AIC Score":
                bic.append(gm.aic(x))
            else:
                bic.append(gm.score(x))

    color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
    bars = []
    # One group of bars per covariance type, offset along the K axis.
    for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
        xpos = np.array(krange) + .2 * (i - 2)
        bars.append(plt.bar(xpos, bic[i * len(krange):(i + 1) * len(krange)],
                            width=.2, color=color))
    plt.title(title)
    plt.xlabel('Number of components')
    plt.ylabel(ylabel)
    plt.legend([b[0] for b in bars], cv_types)
    plt.savefig(filnam)
    return
def best_gmm(X, max_range=np.arange(2, 11), covariance_types=None, max_iter=1000,
             n_init=5, seed=SEED):
    """
    Return the best Gaussian Mixture Model given the data, a range of K
    values, and two K selection criteria.

    :param X: usage matrix (made of usage vectors)
    :param max_range: range within which the number of clusters should lie
    :param covariance_types: a list containing any subset of
        ['full', 'spherical', 'tied', 'diag']
    :param max_iter: maximum number of EM iterations
    :param n_init: number of EM runs
    :param seed: random seed
    :return: best GMM according to the Akaike Information Criterion, best GMM
        according to the Bayesian Information Criterion, and the respective
        AIC and BIC scores
    """
    if covariance_types is None:
        covariance_types = ['full', 'spherical', 'tied', 'diag']
    if not isinstance(covariance_types, list):
        covariance_types = [covariance_types]

    aics = defaultdict(list)
    bics = defaultdict(list)
    # Track the best scores explicitly rather than querying an unfitted
    # placeholder model as the original did (calling .aic() on an unfitted
    # sklearn estimator raises a NotFittedError).
    best_aic, best_bic = np.inf, np.inf
    best_gmm_aic = best_gmm_bic = None
    for cov in covariance_types:
        for k in max_range:
            m = GaussianMixture(n_components=k, covariance_type=cov,
                                max_iter=max_iter, n_init=n_init,
                                random_state=seed).fit(X)
            if m.aic(X) < best_aic:
                best_aic, best_gmm_aic = m.aic(X), GMM(m)
            if m.bic(X) < best_bic:
                best_bic, best_gmm_bic = m.bic(X), GMM(m)
            bics[cov].append(m.bic(X))
            aics[cov].append(m.aic(X))

    return best_gmm_aic, best_gmm_bic, bics, aics
def run_EM(X, y, title):
    kdist = list(np.arange(2, 100, 5))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []

    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='diag', n_init=1,
                warm_start=True, random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)

        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        y_mode_vote = cluster_predictions(y, labels)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))

    # (the original emitted the silhouette figure twice, verbatim; once is enough)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Silhouette')
    plt.title(title + ' Exp Max Silhouette')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, f1_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('F1 Score')
    plt.title(title + ' Exp Max F1')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Model Complexity Score')
    plt.title(title + ' Exp Max Model Complexity')
    plt.legend(loc="best")
    plt.show()
def run_EM(X_norm, y, title):
    range_n_clusters = [2, 3, 4, 5, 6]
    silhouette_avg2 = []
    homo2 = []
    comp2 = []
    NMI2 = []
    AIC = []
    BIC = []
    start = time.perf_counter()
    for index, n_clusters in enumerate(range_n_clusters):
        # Initialize the clusterer with the n_clusters value and a random
        # generator seed of 10 for reproducibility.
        clusterer2 = GaussianMixture(n_components=n_clusters, random_state=10).fit(X_norm)
        cluster2_labels = clusterer2.predict(X_norm)
        # The silhouette_score gives the average value for all the samples,
        # which gives a perspective on the density and separation of the
        # formed clusters.
        silhouette_avg2.append(silhouette_score(X_norm, cluster2_labels))
        homo2.append(metrics.homogeneity_score(y, cluster2_labels))
        comp2.append(metrics.completeness_score(y, cluster2_labels))
        NMI2.append(normalized_mutual_info_score(y, cluster2_labels))
        AIC.append(clusterer2.aic(X_norm))
        BIC.append(clusterer2.bic(X_norm))
    end = time.perf_counter()
    # the original printed start - end, which is always negative
    print("EM run time: %.1f [s]" % (end - start))

    plt.plot(range_n_clusters, silhouette_avg2, label="silhouette")
    plt.plot(range_n_clusters, homo2, label="homogeneity")
    plt.plot(range_n_clusters, comp2, label="completeness")
    plt.plot(range_n_clusters, NMI2, label="NMI")
    plt.ylabel('value')
    plt.xlabel('number of clusters')
    plt.legend(loc="best")
    plt.title(title)
    plt.show()

    plt.plot(range_n_clusters, AIC, label="AIC")
    plt.plot(range_n_clusters, BIC, label="BIC")
    plt.ylabel('value')
    plt.xlabel('number of clusters')
    plt.legend(loc="best")
    plt.title(title)
    plt.show()

    # visualization of clusters
    k1 = 4
    plt.figure()
    plt.hist(cluster2_labels, bins=np.arange(0, k1 + 1) - 0.5, rwidth=0.5, zorder=2)
    plt.xticks(np.arange(0, k1))
    plt.xlabel('Cluster label')
    plt.ylabel('Number of samples')
    plt.title(title)
    plt.show()
def em_sweep_clusters(clusters, dataset, data, data_labels, dim_red=None):
    if dim_red is None:
        file = './results/em_clusters_' + dataset + '.csv'
    else:
        file = './results/' + dim_red + '_em_clusters_' + dataset + '.csv'

    if dim_red is not None:
        comp_count = best_comp_count(dataset, dim_red)
        # Use == for string comparison ("is" tests identity, which is not
        # guaranteed for strings) and chain with elif so a successful branch
        # is not clobbered by the final else.
        if dim_red == "PCA":
            dim_red = PCA(n_components=comp_count, random_state=0)
            transformed_data = dim_red.fit_transform(data)
        elif dim_red == "ICA":
            dim_red = FastICA(n_components=comp_count, random_state=0)
            transformed_data = dim_red.fit_transform(data)
        elif dim_red == "RP":
            dim_red = SparseRandomProjection(n_components=comp_count, random_state=0)
            transformed_data = dim_red.fit_transform(data)
        elif dim_red == "RF":
            dim_red = RandomForestClassifier(n_estimators=comp_count, random_state=0,
                                             n_jobs=-1).fit(data, data_labels)
            transformed_data = selectKImportance(dim_red, data, comp_count)
        else:
            transformed_data = data
    else:
        transformed_data = data
    print("Transformed data. Orig shape: ", data.shape, " new shape: ", transformed_data.shape)

    with open(file, 'w') as f:
        f.write('{},{},{},{},{},{},{}\n'.format("n_components", "score", "bic", "aic",
                                                "norm_mutual_info", "purity", "fit_time"))
    for cluster in clusters:
        if dim_red is not None and comp_count < cluster:
            continue
        start = time.time()
        em = GaussianMixture(n_components=cluster, random_state=0).fit(transformed_data)
        end = time.time()
        elapsed = end - start
        print("Fit clusters of: ", cluster, " on ", dataset, " in ", elapsed)
        aic = em.aic(transformed_data)
        print("For clusters of: ", cluster, " on ", dataset, " data set, got aic of: ", aic)
        bic = em.bic(transformed_data)
        print("For clusters of: ", cluster, " on ", dataset, " data set, got bic of: ", bic)
        score = em.score(transformed_data)
        print("For clusters of ", cluster, " on ", dataset, " data set, got score of: ", score)
        data_cluster_labels = em.predict(transformed_data)
        nmi = normalized_mutual_info_score(data_labels, data_cluster_labels)
        purity = purity_score(data_labels, data_cluster_labels)
        print("\tValidation: got nmi of: ", nmi, " and purity of: ", purity)
        with open(file, 'a') as f:
            f.write('{},{},{},{},{},{},{}\n'.format(cluster, score, bic, aic, nmi,
                                                    purity, elapsed))
    return
def AIC_selection(self):
    for n_comp in range(1, self.n_components + 1):
        GMM = GaussianMixture(n_components=n_comp, max_iter=10000)
        GMM.fit(self.data)
        self.AIC_score.append(GMM.aic(self.data))
    self.bestAIC_k = np.argmin(self.AIC_score) + 1
    print("Best k by AIC: %d" % (self.bestAIC_k))
    self.model = GaussianMixture(n_components=self.bestAIC_k, max_iter=10000)
    self.model.fit(self.data)
def aicMethod(self, data):
    aics = []
    for n_clusters in tqdm(self.cluster_range):
        gmm = GaussianMixture(n_components=n_clusters, covariance_type='full')
        gmm.fit(data)
        aics.append(gmm.aic(data))
    print(aics)
    return self.cluster_range[aics.index(min(aics))]
def gaussian_parameter_search(df, n_components, cov_type="full"):
    AIC = {}
    BIC = {}
    # The original duplicated an identical loop for each covariance type;
    # passing cov_type straight through is equivalent and returns empty dicts
    # for an unrecognized type, just as before.
    if cov_type in ("full", "tied", "diag", "spherical"):
        for n in n_components:
            gmm = GaussianMixture(n, covariance_type=cov_type, max_iter=1000,
                                  n_init=25, random_state=42).fit(df)
            AIC[n] = gmm.aic(df)
            BIC[n] = gmm.bic(df)
    return AIC, BIC
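# A hedged usage sketch for gaussian_parameter_search on synthetic data (the
# blobs below are illustrative); the returned dicts map each component count
# to its AIC/BIC, so the best count is the key with the minimum value.
import numpy as np

rng = np.random.default_rng(42)
df = np.vstack([rng.normal(0, 1, (200, 2)), rng.normal(5, 1, (200, 2))])
AIC, BIC = gaussian_parameter_search(df, n_components=range(1, 6), cov_type="diag")
best_k = min(AIC, key=AIC.get)
print("best K by AIC:", best_k)  # expected: 2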
def fit_model(X, n_init=50):
    aic = []
    lowest_aic = np.infty
    for n in range(1, 10):
        mog = GaussianMixture(n_components=n, n_init=n_init)
        mog.fit(X)
        aic.append(mog.aic(X))
        if aic[-1] < lowest_aic:
            lowest_aic = aic[-1]
            best_mog = mog
    return best_mog
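# Minimal sketch exercising fit_model on bimodal 1-D data (synthetic,
# illustrative values); the lowest-AIC mixture over n = 1..9 components is
# returned.
import numpy as np

rng = np.random.default_rng(0)
X = np.concatenate([rng.normal(-2, 0.5, 300), rng.normal(2, 0.5, 300)]).reshape(-1, 1)
best = fit_model(X, n_init=10)
print("chosen components:", best.n_components)  # expected: 2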
def compute_em_elbow_curves():
    plt.figure()
    processor.latext_start_figure()
    for dataset in datasets:
        dataset_name = dataset.__class__.__name__
        print('%s' % dataset_name)
        X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
        distortions = []
        clusters = []
        times = []
        iterations = []
        silhouette_coefficients = []
        aics = []
        for x in range(2, 11):
            i = int(x)
            print('# of clusters: %i' % i)
            km = GaussianMixture(n_components=i, n_init=10, max_iter=600,
                                 random_state=0, tol=0.0001)
            try:
                t0 = time()
                km.fit(X_train)
                times.append(round(time() - t0, 6))
                print('Converged:', km.converged_)  # check if the model has converged
                means = km.means_
                covariances = km.covariances_
                aics.append(km.aic(X_train))
                distortions_score = km.score(X=X_train)
                distortions.append(1.0 / distortions_score)
                labels = km.predict(X=X_train)
                score = silhouette_score(X_train, labels)
                clusters.append(i)
                iterations.append(km.n_iter_)
                silhouette_coefficients.append(score)
            except Exception as e:
                pass  # skip cluster counts that fail to fit
        draw_plot(clusters, distortions, 'Distortion', dataset_name, "em")
        draw_plot(clusters, aics, 'AIC', dataset_name, "em")
        draw_plot(clusters, times, 'Training Time', dataset_name, "em")
        draw_plot(clusters, iterations, 'Iterations', dataset_name, "em")
        draw_plot(clusters, silhouette_coefficients, 'Silhouette Coefficient',
                  dataset_name, "em")
        kl = KneeLocator(clusters, distortions, curve="convex", direction="decreasing")
        print(kl.elbow)
    processor.latex_end_figure(caption="Cluster Validation", fig="cluster_curve")
def AIC_extraction(self, X, n_components=10, covariance_type='full'):
    # Evaluate GMMs fitted with 1..n_components components using Akaike's
    # Information Criterion.
    AIC = []
    for n in range(n_components):
        gm = GaussianMixture(n_components=n + 1, covariance_type=covariance_type,
                             random_state=self._random_state)
        gm.fit(X)
        AIC.append(gm.aic(X))
    return AIC
def aic_select(self):
    self.aic_b = True
    # float('inf') replaces the original 9999, which AIC can easily exceed
    minaic = float('inf')
    for n in range(1, self.n_components + 1):
        gmm = GaussianMixture(n_components=n)
        gmm.fit(self.data)
        self.aic.append(gmm.aic(self.data))
        if self.aic[-1] < minaic:
            minaic = self.aic[-1]
            self.model = deepcopy(gmm)
    print("aic\n", self.aic)
    self.res_n = self.aic.index(minaic) + 1
    print("selected components:", self.res_n, '\n')
def gmm_analysis(self, X_train, X_test, y_train, y_test, data_set_name,
                 max_clusters, analysis_name='GMM'):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    em_bic = []
    em_aic = []
    em_completeness_score = []
    em_homogeneity_score = []
    em_measure_score = []
    em_adjusted_rand_score = []
    em_adjusted_mutual_info_score = []

    cluster_range = np.arange(2, max_clusters + 1, 1)
    for k in cluster_range:
        print('K Clusters: ', k)
        ##
        ## Expectation Maximization
        ##
        em = GaussianMixture(n_components=k, covariance_type='full')
        em.fit(X_train_scl)
        em_pred = em.predict(X_train_scl)
        em_bic.append(em.bic(X_train_scl))
        em_aic.append(em.aic(X_train_scl))
        # metrics
        y_train_score = y_train.reshape(y_train.shape[0],)
        em_homogeneity_score.append(homogeneity_score(y_train_score, em_pred))
        em_completeness_score.append(completeness_score(y_train_score, em_pred))
        em_measure_score.append(v_measure_score(y_train_score, em_pred))
        em_adjusted_rand_score.append(adjusted_rand_score(y_train_score, em_pred))
        em_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, em_pred))

    ##
    ## Plots
    ##
    ph = plot_helper()

    ##
    ## BIC/AIC Plot
    ##
    title = 'Information Criterion Plot (' + analysis_name + ') for ' + data_set_name
    name = data_set_name.lower() + '_' + analysis_name.lower() + '_ic'
    filename = './' + self.out_dir + '/' + name + '.png'
    ph.plot_series(cluster_range,
                   [em_bic, em_aic],
                   [None, None],
                   ['bic', 'aic'],
                   cm.viridis(np.linspace(0, 1, 2)),
                   ['o', '*'],
                   title, 'Number of Clusters', 'Information Criterion', filename)

    ##
    ## Score Plot
    ##
    title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
    name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
    filename = './' + self.out_dir + '/' + name + '.png'
    ph.plot_series(cluster_range,
                   [em_homogeneity_score, em_completeness_score, em_measure_score,
                    em_adjusted_rand_score, em_adjusted_mutual_info_score],
                   [None, None, None, None, None, None],
                   ['homogeneity', 'completeness', 'measure', 'adjusted_rand',
                    'adjusted_mutual_info'],
                   cm.viridis(np.linspace(0, 1, 5)),
                   ['o', '^', 'v', '>', '<', '1'],
                   title, 'Number of Clusters', 'Score', filename)
w = np.exp(-np.exp(3 * w.mean(axis=1)))

# GMM model selection with AIC:
lowest_aic = np.infty
aic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type, n_init=5)
        gmm.fit(X)
        aic.append(gmm.aic(X))
        if aic[-1] < lowest_aic:
            lowest_aic = aic[-1]
            best_gmm = gmm

preds = best_gmm.predict(X)
probs = best_gmm.predict_proba(X)

# The aic list is ordered covariance-type-major, so reshape to one row per
# covariance type; the original reshape(-1, len(cv_types)).T interleaved the
# scores across covariance types.
for name, row in zip(cv_types, np.array(aic).reshape(len(cv_types), -1)):
    plt.plot(n_components_range, row, label=name)
plt.legend()
plt.savefig('gmm_sklearn_aic/aic.pdf')

data_thr['preds'] = pd.Series(preds).astype("category")