def fit_mixtures(X, mag, mbins, binwidth=0.2, seed=None,
                 keepscore=False, keepbic=False, **kwargs):
    kwargs.setdefault('n_components', 25)
    kwargs.setdefault('covariance_type', 'full')
    fits = []
    if keepscore:
        scores = []
    if keepbic:
        bics = []
    if seed is not None:
        np.random.seed(seed)
    for bincenter in mbins:
        # this is not an efficient way to assign bins, but the time
        # is negligible compared to the GMM fitting anyway
        ii = np.where(np.abs(mag - bincenter) < binwidth)[0]
        gmm = GaussianMixture(**kwargs)
        gmm.fit(X[ii])
        fits.append(gmm)
        if keepscore:
            scores.append(gmm.score(X[ii]))
        if keepbic:
            bics.append(gmm.bic(X[ii]))
    rv = (fits,)
    if keepscore:
        rv += (scores,)
    if keepbic:
        rv += (bics,)
    return rv
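# A minimal usage sketch for fit_mixtures above; the synthetic X, mag, and bin
# grid below are illustrative assumptions, not part of the original pipeline.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(12345)
X_demo = rng.normal(size=(5000, 4))            # hypothetical color features
mag_demo = rng.uniform(17.0, 22.0, size=5000)  # hypothetical magnitudes
mbins_demo = np.arange(17.2, 22.0, 0.4)        # magnitude bin centers

fits, bics = fit_mixtures(X_demo, mag_demo, mbins_demo, seed=1,
                          keepbic=True, n_components=5)
print('lowest-BIC bin center: {:.2f}'.format(mbins_demo[np.argmin(bics)]))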
X_train = datatrans[train_index]
y_train = np.array(target[train_index])
X_test = datatrans[test_index]
y_test = target[test_index]

# Find the best number of mixture components using BIC across covariance types.
optimal = []
for s in range(2, 6):  # a wider range would be slow, since the dataset is fairly large
    for cov_type in ['spherical', 'diag', 'tied', 'full']:
        model = GaussianMixture(n_components=s, covariance_type=cov_type,
                                max_iter=150, n_init=20,
                                random_state=500).fit(X_train)
        bic = model.bic(X_train)  # evaluate BIC on the same split used for fitting
        optimal.append([bic, cov_type, s])
        # optionally plot BIC vs. n_components here as a line chart
        # (at the cost of extra runtime)
final_components = min(optimal)  # the lowest BIC indicates the best fit
n_classes = final_components[2]

reg_ = [0.0001, 0.008, 0.05, 0.1, 0.2, 0.3, 0.5]  # regularization strengths for the covariance
param_ = ['random', 'kmeans']
# Try GMMs using different types of covariances.
for u in reg_:
    for param in param_:
        estimators = {
            cov_type: GaussianMixture(n_components=n_classes,
Cancer_EM_bic = []
Cancer_EM_aic = []
Cancer_EM_score = []
Cancer_EM_homogeneity_score = []
Cancer_EM_complete_score = []
Cancer_EM_log = []
Cancer_EM_train_acc = []
Cancer_EM_cv_acc = []
for i in n_components:
    print(i)
    EM.set_params(random_state=7641, n_components=i)
    EM.fit(Cancer_X)
    Cancer_EM_score.append(EM.score(Cancer_X_train))  # mean log-likelihood
    Cancer_EM_bic.append(EM.bic(Cancer_X_train))
    Cancer_EM_aic.append(EM.aic(Cancer_X_train))
    Cancer_EM_log.append(silhouette_score(Cancer_X_train, EM.predict(Cancer_X_train)))
    Cancer_EM_homogeneity_score.append(homogeneity_score(Cancer_y_train, EM.predict(Cancer_X_train)))
    Cancer_EM_complete_score.append(completeness_score(Cancer_y_train, EM.predict(Cancer_X_train)))
    Cancer_scores = cross_validate(EM, Cancer_X_train, Cancer_y_train, cv=5,
                                   scoring=make_scorer(my_custom_acc, greater_is_better=True),
                                   n_jobs=-1, return_train_score=True)
    Cancer_EM_train_acc.append(np.mean(Cancer_scores['train_score']))
    Cancer_EM_cv_acc.append(np.mean(Cancer_scores['test_score']))

PlotEm(6, n_components, Cancer_EM_aic, 'AIC', 'Cancer')
PlotEm(7, n_components, Cancer_EM_bic, 'BIC', 'Cancer')
PlotEm(8, n_components, Cancer_EM_score, 'Log-Likelihood', 'Cancer')  # EM.score is a mean log-likelihood, not SSE
PlotEm(9, n_components, Cancer_EM_log, 'Silhouette', 'Cancer')  # this list holds silhouette scores
PlotEm(10, n_components, Cancer_EM_homogeneity_score, 'homogeneity_score', 'Cancer')
PlotEm(11, n_components, Cancer_EM_complete_score, 'complete_score', 'Cancer')
# K-fold crossvalidation CV = model_selection.KFold(n_splits=10, shuffle=True) for t, K in enumerate(KRange): print('Fitting model for K={0}'.format(K)) # Fit Gaussian mixture model gmm = GaussianMixture(n_components=K, covariance_type=covar_type, n_init=reps, init_params=init_procedure, tol=1e-6, reg_covar=1e-6).fit(X) # Get BIC and AIC BIC[t, ] = gmm.bic(X) AIC[t, ] = gmm.aic(X) # For each crossvalidation fold for train_index, test_index in CV.split(X): # extract training and test set for current CV fold X_train = X[train_index] X_test = X[test_index] # Fit Gaussian mixture model to X_train gmm = GaussianMixture(n_components=K, covariance_type=covar_type, n_init=reps).fit(X_train) # compute negative log likelihood of X_test
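        # A hedged sketch of the step the comment above describes; the fuller
        # variant of this snippet later in this collection accumulates
        #     CVE[t] += -gmm.score_samples(X_test).sum()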
fn = open(args.outroot + '.nzgmm.json', 'w')
for sel in sels:
    print(sel)
    #
    bic, lowest_bic = [], np.infty
    n_components_range = range(1, args.ngauss)
    #
    z = data['zphot'][data['is' + sel]]
    print(sel, len(z), 'obj.')
    # gmm
    X = z.reshape((len(z), 1))
    for n_components in n_components_range:
        gmm = GaussianMixture(n_components=n_components, covariance_type='diag')
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
    bic = np.array(bic)
    clf = best_gmm
    ## rounding
    ps = np.round(clf.weights_, 3).tolist()
    mus = np.round(clf.means_.flatten(), 3).tolist()
    sds = np.round(np.sqrt(clf.covariances_.flatten()), 3).tolist()
    mydict[sel] = {}
    mydict[sel]['p'], mydict[sel]['mu'], mydict[sel]['sd'] = ps, mus, sds
    ## plotting
    fig, ax = plt.subplots()
    nzraw = np.array([((z >= zgrid[i]) & (z < zgrid[i + 1])).sum() for i in range(nbins)])
# Put numbers back into the original shape so we can reconstruct the segmented image
original_shape = img.shape
segmented = gmm_labels.reshape(original_shape[0], original_shape[1])
plt.imshow(segmented)
#cv2.imwrite("images/segmented.jpg", segmented)

##############################################################
# How to know the best number of components?
# Use the Bayesian information criterion (BIC) to find the best number of components.
import numpy as np
import cv2

img = cv2.imread("images/BSE.tif")
img2 = img.reshape((-1, 3))

from sklearn.mixture import GaussianMixture as GMM

n = 4
gmm_model = GMM(n, covariance_type='tied').fit(img2)
# The above line fits a GMM model for n=4 components.
# Now let us call the bic method (or aic if you want).
bic_value = gmm_model.bic(img2)  # remember to call the same model name from above
print(bic_value)  # you should see the BIC for the GMM fitted with n=4
# Do this exercise for different n values and plot them to find the minimum.

# Now, to explain m.bic, here are the lines I used in the video.
n_components = np.arange(1, 10)
gmm_models = [GMM(n, covariance_type='tied').fit(img2) for n in n_components]
plt.plot(n_components, [m.bic(img2) for m in gmm_models], label='BIC')
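# A short follow-up sketch: select the component count with the lowest BIC from
# the models fitted above and refit on the image (reuses img2 and the GMM alias).
bics = [m.bic(img2) for m in gmm_models]
best_n = int(n_components[np.argmin(bics)])
best_model = GMM(best_n, covariance_type='tied').fit(img2)
print('best n by BIC:', best_n)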
def run_EM(X, y, title):
    # kdist = [2, 3, 4, 5]
    # kdist = list(range(2, 51))
    kdist = list(np.arange(2, 150, 5))
    sil_scores = []; f1_scores = []; homo_scores = []
    train_times = []; aic_scores = []; bic_scores = []

    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='diag', n_init=1,
                warm_start=True, random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)

        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        y_mode_vote = cluster_predictions(y, labels)
        # f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))

    # elbow curve for silhouette score
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Elbow Plot for EM: ' + title)
    plt.show()

    # plot homogeneity scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, homo_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Homogeneity Score')
    plt.title('Homogeneity Scores EM: ' + title)
    plt.show()

    # plot f1 scores
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # ax.plot(kdist, f1_scores)
    # plt.grid(True)
    # plt.xlabel('No. Distributions')
    # plt.ylabel('F1 Score')
    # plt.title('F1 Scores EM: ' + title)
    # plt.show()

    # plot model AIC and BIC
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Model Complexity Score')
    plt.title('EM Model Complexity: ' + title)
    plt.legend(loc="best")
    plt.show()
def gmm_analysis(self, X_train, X_test, y_train, y_test, data_set_name,
                 max_clusters, analysis_name='GMM'):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    em_bic = []
    em_aic = []
    em_completeness_score = []
    em_homogeneity_score = []
    em_measure_score = []
    em_adjusted_rand_score = []
    em_adjusted_mutual_info_score = []

    cluster_range = np.arange(2, max_clusters + 1, 1)
    for k in cluster_range:
        print('K Clusters: ', k)
        ##
        ## Expectation Maximization
        ##
        em = GaussianMixture(n_components=k, covariance_type='full')
        em.fit(X_train_scl)
        em_pred = em.predict(X_train_scl)

        em_bic.append(em.bic(X_train_scl))
        em_aic.append(em.aic(X_train_scl))

        # metrics
        y_train_score = y_train.reshape(y_train.shape[0],)
        em_homogeneity_score.append(homogeneity_score(y_train_score, em_pred))
        em_completeness_score.append(completeness_score(y_train_score, em_pred))
        em_measure_score.append(v_measure_score(y_train_score, em_pred))
        em_adjusted_rand_score.append(adjusted_rand_score(y_train_score, em_pred))
        em_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, em_pred))

    ##
    ## Plots
    ##
    ph = plot_helper()

    ##
    ## BIC/AIC Plot
    ##
    title = 'Information Criterion Plot (' + analysis_name + ') for ' + data_set_name
    name = data_set_name.lower() + '_' + analysis_name.lower() + '_ic'
    filename = './' + self.out_dir + '/' + name + '.png'

    ph.plot_series(cluster_range,
                   [em_bic, em_aic],
                   [None, None],
                   ['bic', 'aic'],
                   cm.viridis(np.linspace(0, 1, 2)),
                   ['o', '*'],
                   title,
                   'Number of Clusters',
                   'Information Criterion',
                   filename)

    ##
    ## Score Plot
    ##
    title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
    name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
    filename = './' + self.out_dir + '/' + name + '.png'

    # one style entry per plotted series (five series, five styles)
    ph.plot_series(cluster_range,
                   [em_homogeneity_score, em_completeness_score, em_measure_score,
                    em_adjusted_rand_score, em_adjusted_mutual_info_score],
                   [None, None, None, None, None],
                   ['homogeneity', 'completeness', 'measure',
                    'adjusted_rand', 'adjusted_mutual_info'],
                   cm.viridis(np.linspace(0, 1, 5)),
                   ['o', '^', 'v', '>', '<'],
                   title,
                   'Number of Clusters',
                   'Score',
                   filename)
def __do_perform(self, custom_out=None, main_experiment=None): if custom_out is not None: # if not os.path.exists(custom_out): # os.makedirs(custom_out) self._old_out = self._out self._out = custom_out elif self._old_out is not None: self._out = self._old_out if main_experiment is not None: self.log("Performing {} as part of {}".format( self.experiment_name(), main_experiment.experiment_name())) else: self.log("Performing {}".format(self.experiment_name())) # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py # %% Data for 1-3 sse = defaultdict(list) ll = defaultdict(list) bic = defaultdict(list) sil = defaultdict(lambda: defaultdict(list)) sil_s = np.empty(shape=(2 * len(self._clusters) * self._details.ds.training_x.shape[0], 4), dtype='<U21') acc = defaultdict(lambda: defaultdict(float)) adj_mi = defaultdict(lambda: defaultdict(float)) km = kmeans(random_state=self._details.seed) gmm = GMM(random_state=self._details.seed) st = clock() j = 0 for k in self._clusters: km.set_params(n_clusters=k) gmm.set_params(n_components=k) km.fit(self._details.ds.training_x) gmm.fit(self._details.ds.training_x) km_labels = km.predict(self._details.ds.training_x) gmm_labels = gmm.predict(self._details.ds.training_x) sil[k]['Kmeans'] = sil_score(self._details.ds.training_x, km_labels) sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels) km_sil_samples = sil_samples(self._details.ds.training_x, km_labels) gmm_sil_samples = sil_samples(self._details.ds.training_x, gmm_labels) # There has got to be a better way to do this, but I can't brain right now for i, x in enumerate(km_sil_samples): sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]] j += 1 for i, x in enumerate(gmm_sil_samples): sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]] j += 1 sse[k] = [km.score(self._details.ds.training_x)] ll[k] = [gmm.score(self._details.ds.training_x)] bic[k] = [gmm.bic(self._details.ds.training_x)] acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y, km_labels) acc[k]['GMM'] = cluster_acc(self._details.ds.training_y, gmm_labels) adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels) adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels) self.log("Cluster: {}, time: {}".format(k, clock() - st)) sse = (-pd.DataFrame(sse)).T sse.index.name = 'k' sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)] ll = pd.DataFrame(ll).T ll.index.name = 'k' ll.columns = [ '{} log-likelihood'.format(self._details.ds_readable_name) ] bic = pd.DataFrame(bic).T bic.index.name = 'k' bic.columns = ['{} BIC'.format(self._details.ds_readable_name)] sil = pd.DataFrame(sil).T sil_s = pd.DataFrame(sil_s, columns=['k', 'type', 'score', 'label']).set_index('k') #.T # sil_s = sil_s.T acc = pd.DataFrame(acc).T adj_mi = pd.DataFrame(adj_mi).T sil.index.name = 'k' sil_s.index.name = 'k' acc.index.name = 'k' adj_mi.index.name = 'k' sse.to_csv(self._out.format('{}_sse.csv'.format( self._details.ds_name))) ll.to_csv( self._out.format('{}_logliklihood.csv'.format( self._details.ds_name))) bic.to_csv(self._out.format('{}_bic.csv'.format( self._details.ds_name))) sil.to_csv( self._out.format('{}_sil_score.csv'.format(self._details.ds_name))) sil_s.to_csv( self._out.format('{}_sil_samples.csv'.format( self._details.ds_name))) acc.to_csv(self._out.format('{}_acc.csv'.format( self._details.ds_name))) adj_mi.to_csv( self._out.format('{}_adj_mi.csv'.format(self._details.ds_name))) # %% NN fit data (2,3) grid = { 'km__n_clusters': self._clusters, 'NN__alpha': self._nn_reg, 
            'NN__hidden_layer_sizes': self._nn_arch}
        mlp = MLPClassifier(activation='relu', max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        km = kmeans(random_state=self._details.seed,
                    n_jobs=self._details.threads)
        pipe = Pipeline([('km', km), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans')
        self.log("KMeans grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(self._out.format('{}_cluster_kmeans.csv'.format(
            self._details.ds_name)))

        grid = {'gmm__n_components': self._clusters,
                'NN__alpha': self._nn_reg,
                'NN__hidden_layer_sizes': self._nn_arch}
        mlp = MLPClassifier(activation='relu', max_iter=2000,
                            early_stopping=True,
                            random_state=self._details.seed)
        gmm = CustomGMM(random_state=self._details.seed)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)],
                        memory=experiments.pipeline_memory)
        gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm')
        self.log("GMM grid search complete")

        tmp = pd.DataFrame(gs.cv_results_)
        tmp.to_csv(self._out.format('{}_cluster_GMM.csv'.format(
            self._details.ds_name)))

        # %% For chart 4/5
        self._details.ds.training_x2D = TSNE(
            verbose=10, random_state=self._details.seed).fit_transform(
                self._details.ds.training_x)

        ds_2d = pd.DataFrame(np.hstack(
            (self._details.ds.training_x2D,
             np.atleast_2d(self._details.ds.training_y).T)),
            columns=['x', 'y', 'target'])
        ds_2d.to_csv(self._out.format('{}_2D.csv'.format(self._details.ds_name)))
        self.log("Done")
def _fit_cluster(self, X, y, params): label_init = self.label_init if label_init is not None: onehot = _labels_to_onehot(label_init) weights_init, means_init, precisions_init = _onehot_to_initial_params( X, onehot, params[1]["covariance_type"]) gm_params = params[1] gm_params["weights_init"] = weights_init gm_params["means_init"] = means_init gm_params["precisions_init"] = precisions_init elif params[0]["affinity"] != "none": agg = AgglomerativeClustering(**params[0]) n = X.shape[0] if self.max_agglom_size is None or n <= self.max_agglom_size: X_subset = X else: # if dataset is huge, agglomerate a subset subset_idxs = np.random.choice(np.arange(0, n), self.max_agglom_size) X_subset = X[subset_idxs, :] agg_clustering = agg.fit_predict(X_subset) onehot = _labels_to_onehot(agg_clustering) weights_init, means_init, precisions_init = _onehot_to_initial_params( X_subset, onehot, params[1]["covariance_type"]) gm_params = params[1] gm_params["weights_init"] = weights_init gm_params["means_init"] = means_init gm_params["precisions_init"] = precisions_init else: gm_params = params[1] gm_params["init_params"] = "kmeans" gm_params["reg_covar"] = 0 gm_params["max_iter"] = self.max_iter criter = np.inf # if none of the iterations converge, bic/aic is set to inf # below is the regularization scheme while gm_params["reg_covar"] <= 1 and criter == np.inf: model = GaussianMixture(**gm_params) try: model.fit(X) predictions = model.predict(X) counts = [ sum(predictions == i) for i in range(gm_params["n_components"]) ] # singleton clusters not allowed assert not any([count <= 1 for count in counts]) except ValueError: gm_params["reg_covar"] = _increase_reg(gm_params["reg_covar"]) continue except AssertionError: gm_params["reg_covar"] = _increase_reg(gm_params["reg_covar"]) continue # if the code gets here, then the model has been fit with no errors or # singleton clusters if self.selection_criteria == "bic": criter = model.bic(X) else: criter = model.aic(X) break if y is not None: self.predictions = model.predict(X) ari = adjusted_rand_score(y, self.predictions) else: ari = float("nan") results = { "model": model, "bic/aic": criter, "ari": ari, "n_components": gm_params["n_components"], "affinity": params[0]["affinity"], "linkage": params[0]["linkage"], "covariance_type": gm_params["covariance_type"], "reg_covar": gm_params["reg_covar"], } return results
def recommend_coldstart(song_input, songs_np, songs_df, num_recommend_gmm, num_recommend_nn, gmm_clusters): '''Generates song recommendations based on Nearest Neighbours and GMM sampling. Inputs song_inputs: Index of song that user likes. songs_np: Numpy array of numeric attributes of dataset. songs_df: Full dataframe. num_recommend_nn: Number of songs to recommend using NN. num_recommend_gmm: Number of songs to recommend using GMM sampling. gmm_clusters: Number of clusters for GMM model. Will find optimal if specified as 0. Outputs nn_recc_songs: Recommendations using NN. gmm_recc_songs: Recommendations using GMM. ''' query_song = songs_np[song_input] playlist_idx = songs_df[songs_df['track_uri'] == songs_df.iloc[song_input] ['track_uri']]['pid'].values query_songs_df = songs_df[songs_df['pid'].isin(playlist_idx)] idx = query_songs_df.drop_duplicates(subset=['track_uri']).index.values query_songs_np = songs_np[idx] if gmm_clusters == 0: #Do tuning print("Tuning hyperparameters for GMM.") n_clusters = np.arange(2, 10) sils = [] bics = [] iterations = 20 for n in tqdm(n_clusters): tmp_sil = [] tmp_bic = [] for _ in range(iterations): gmm = GaussianMixture(n, n_init=2).fit(query_songs_np) labels = gmm.predict(query_songs_np) sil = silhouette_score(query_songs_np, labels, metric='euclidean') tmp_sil.append(sil) tmp_bic.append(gmm.bic(query_songs_np)) val = np.mean(SelBest(np.array(tmp_sil), int(iterations / 5))) sils.append(val) val = np.mean(SelBest(np.array(tmp_bic), int(iterations / 5))) bics.append(val) gmm_clusters = int( (n_clusters[np.argmin(bics)] + n_clusters[np.argmax(sils)]) / 2) print("Optimal number of clusters: {}.".format(gmm_clusters)) print("Fitting models.") gmm = GaussianMixture(n_components=gmm_clusters).fit(query_songs_np) nn = NearestNeighbors().fit(query_songs_np) print("Generating recommendations.") #GMM sampling label_gmm = gmm.predict(query_song.reshape(1, -1))[0] #to ensure we get 10 recommendations num_being_recommended = 0 while num_being_recommended < num_recommend_gmm: samples = np.random.multivariate_normal(gmm.means_[label_gmm], gmm.covariances_[label_gmm], 2 * num_recommend_gmm) dist, indices = nn.kneighbors(samples, n_neighbors=1) #drop possible duplicates gmm_recc = list(set(indices.flatten()))[:num_recommend_gmm] num_being_recommended = len(gmm_recc) #NN dist, indices = nn.kneighbors(query_song.reshape(1, -1), n_neighbors=num_recommend_nn + 1) nn_recc = indices.flatten()[1:] nn_recc_songs = songs_df.iloc[idx].iloc[nn_recc] gmm_recc_songs = songs_df.iloc[idx].iloc[gmm_recc] return nn_recc_songs, gmm_recc_songs
labels[k]['Kmeans'] = km_labels labels[k]['GMM'] = gmm_labels sil[k]['Kmeans'] = sil_score(dataX, km_labels) sil[k]['GMM'] = sil_score(dataX, gmm_labels) km_sil_samples = sil_samples(dataX, km_labels) gmm_sil_samples = sil_samples(dataX, gmm_labels) for i, x in enumerate(km_sil_samples): sil_samp[j] = [k, 'Kmeans', round(x, 6), km_labels[i]] j += 1 for i, x in enumerate(gmm_sil_samples): sil_samp[j] = [k, 'GMM', round(x, 6), gmm_labels[i]] j += 1 sse[k] = km.score(dataX) ll[k] = gmm.score(dataX) bic[k] = gmm.bic(dataX) acc[k]['Kmeans'] = cluster_acc(dataY,km.predict(dataX)) acc[k]['GMM'] = cluster_acc(dataY,gmm.predict(dataX)) adj_mi[k]['Kmeans'] = ami(dataY,km.predict(dataX)) adj_mi[k]['GMM'] = ami(dataY,gmm.predict(dataX)) gmm_clusters = pd.DataFrame() kmeans_clusters = pd.DataFrame() for i in clusters: gmm_clusters[i] = labels[i]['GMM'] kmeans_clusters[i] = labels[i]['Kmeans'] bic = pd.DataFrame(bic, index=[0]).T
X = X.reshape(-1, 2) colors = (np.ones((N,1)) * np.arange(3)).reshape(-1) pl.figure() pl.scatter(X[:, 0], X[:, 1], c=colors, s=16, lw=0) pl.title('input data') n_components = np.arange(1, 16) BIC = np.zeros(n_components.shape) for i, n in enumerate(n_components): clf = GaussianMixture(n_components=n, covariance_type='diag') clf.fit(X) BIC[i] = clf.bic(X) pl.figure() pl.bar(n_components, BIC, label='BIC') pl.legend(loc=0) pl.xlabel('n_components') pl.ylabel('BIC') i_n = np.argmin(BIC) clf = GaussianMixture(n_components[i_n]) clf.fit(X) label = clf.predict(X) pl.figure() pl.scatter(X[:, 0], X[:, 1], c=label, s=16, lw=0)
N = N1 + N2
x1 = np.random.multivariate_normal(mean=(1, 2), cov=cov1, size=N1)
m = np.array(((1, 1), (1, 3)))
x1 = x1.dot(m)
x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
x = np.vstack((x1, x2))
y = np.array([0] * N1 + [1] * N2)

types = ('spherical', 'diag', 'tied', 'full')
err = np.empty(len(types))
bic = np.empty(len(types))
for i, type in enumerate(types):
    gmm = GaussianMixture(n_components=2, covariance_type=type, random_state=0)
    gmm.fit(x)
    err[i] = 1 - accuracy_rate(gmm.predict(x), y)
    bic[i] = gmm.bic(x)
print('Error rate:', err.ravel())
print('BIC:', bic.ravel())

xpos = np.arange(4)
plt.figure(facecolor='w')
ax = plt.axes()
b1 = ax.bar(xpos - 0.3, err, width=0.3, color='#77E0A0', edgecolor='k')
b2 = ax.twinx().bar(xpos, bic, width=0.3, color='#FF8080', edgecolor='k')
plt.grid(b=True, ls=':', color='#606060')
bic_min, bic_max = expand(bic.min(), bic.max())
plt.ylim((bic_min, bic_max))
plt.xticks(xpos, types)
plt.legend([b1[0], b2[0]], ('Error rate', 'BIC'))
plt.title('Error rate and BIC for different covariance types', fontsize=15)
plt.show()
def gaussian_mixture( X, n_clusters=5, covariance_type="full", best_model=False, max_clusters=10, random_state=None, **kwargs, ): """Clustering with Gaussian Mixture Model. Parameters ---------- X : array-like n x k attribute data n_clusters : int, optional, default: 5 The number of clusters to form. covariance_type: str, optional, default: "full"" The covariance parameter passed to scikit-learn's GaussianMixture algorithm best_model: bool, optional, default: False Option for finding endogenous K according to Bayesian Information Criterion max_clusters: int, optional, default:10 The max number of clusters to test if using `best_model` option random_state: int, optional, default: None The seed used to generate replicable results kwargs Returns ------- fitted cluster instance: sklearn.mixture.GaussianMixture """ if random_state is None: warn( "Note: Gaussian Mixture Clustering is probabilistic--" "cluster labels may be different for different runs. If you need consistency, " "you should set the `random_state` parameter") if best_model is True: # selection routine from # https://plot.ly/scikit-learn/plot-gmm-selection/ lowest_bic = np.infty bic = [] maxn = max_clusters + 1 n_components_range = range(1, maxn) cv_types = ["spherical", "tied", "diag", "full"] for cv_type in cv_types: for n_components in n_components_range: # Fit a Gaussian mixture with EM gmm = GaussianMixture( n_components=n_components, random_state=random_state, covariance_type=cv_type, ) gmm.fit(X) bic.append(gmm.bic(X)) if bic[-1] < lowest_bic: lowest_bic = bic[-1] best_gmm = gmm bic = np.array(bic) model = best_gmm else: model = GaussianMixture( n_components=n_clusters, random_state=random_state, covariance_type=covariance_type, ) model.fit(X) model.labels_ = model.predict(X) return model
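# A minimal usage sketch for gaussian_mixture above; the two-blob synthetic
# dataset is an assumption for illustration only.
import numpy as np

rng = np.random.default_rng(0)
X_demo = np.vstack([rng.normal(0.0, 1.0, size=(100, 2)),
                    rng.normal(5.0, 1.0, size=(100, 2))])
demo_model = gaussian_mixture(X_demo, best_model=True, max_clusters=6,
                              random_state=0)
print(demo_model.n_components, demo_model.covariance_type)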
def fit_gmm(
    max_components,
    n_distances,
    atoms,
    distances,
    regularization_type="bic",
    covariance_type="diag",
):
    """
    Fit a GMM to a set of distances.

    This routine will fit a Gaussian mixture model from a set of input
    distances using sklearn_. The resulting set of parameters can be used to
    initialize a `GMMDistanceRestraint` in a MELD simulation.

    .. _sklearn: http://scikit-learn.org/stable/modules/mixture.html

    Parameters
    ----------
    max_components: int
        Maximum number of components to use in fitting GMM.
    n_distances: int
        Number of distances involved in GMM
    atoms: list of (int, str, int, str) tuples.
        The atoms that are involved in each distance are specified as a list
        of `n_distances` tuples, each of the form (r1, n1, r2, n2), where r1,
        r2 are the integer residue indices starting from one, and n1, n2 are
        the atom names.
    distances: array_like(n_dim=2)
        An (n_samples, n_distances) array of distances (in nm) to fit.
    regularization_type: str
        The type of regularization to use, options are "bic" and "dirichlet".
    covariance_type: str
        The form of the covariance matrix, options are "diag" and "full".

    Returns
    -------
    GMMParams
        The fit parameters, which can be used to initialize a
        `meld.system.restraints.GMMDistanceRestraint` using
        ``GMMDistanceRestraint.from_params``.

    Notes
    -----
    There are two ways to regularize in order to prevent overfitting.

    ``regularization_type="bic"`` will use the Bayesian information criterion
    to penalize models that have more parameters. When using ``bic``, the
    final number of components in the model will be less than or equal to
    `max_components`.

    ``regularization_type=dirichlet`` will use a Dirichlet process prior on
    the weight distributions. The final number of components in the model
    will always be equal to `max_components`, but most of the weights will
    be small.

    There are two forms for the covariance matrix, which differ in the number
    of parameters and expressiveness.

    ``covariance_type="diag"`` will fit using a diagonal covariance matrix.
    This has few parameters, but does not capture correlations between input
    distances. Typically, choosing ``"diag"`` will result in a model with
    more components.

    ``covariance_type="full"`` will fit using a full representation of the
    covariance matrix. This captures correlations between input distances,
    but has far more parameters and is potentially prone to overfitting.
    """
    #
    # Constants
    #
    N_INIT = 25
    MAX_ITER = 1000
    KFOLD_SPLITS = 5
    REG_COVAR = 1e-4
    RANDOMSEARCH_TRIALS = 32

    #
    # Check the inputs
    #
    if distances.shape[1] != n_distances:
        raise ValueError("distances must have shape (n_samples, n_distances)")
    if len(atoms) != n_distances:
        raise ValueError(
            "atoms must be a list of (ind1, name1, ind2, name2) of "
            "length n_distances"
        )
    if regularization_type not in ["bic", "dirichlet"]:
        raise ValueError('regularization_type must be one of ["bic", "dirichlet"]')
    if covariance_type not in ["diag", "full"]:
        raise ValueError('covariance_type must be one of ["diag", "full"]')
    if max_components < 1:
        raise ValueError("max_components must be >= 1")
    if max_components > 32:
        raise ValueError("MELD supports a maximum of 32 GMM components")

    #
    # Create and fit the model
    #
    if regularization_type == "bic":
        # BIC fit
        # Search different values of n_components to find the minimal BIC.
models = [] for i in range(1, max_components + 1): g = GaussianMixture( n_components=i, n_init=N_INIT, max_iter=MAX_ITER, covariance_type=covariance_type, reg_covar=REG_COVAR, ) g.fit(distances) models.append((g.bic(distances), g)) gmm = sorted(models, key=lambda x: x[0])[0][1] else: # Dirichlet process fit # use RandomSearchCV to optimize hyperparameters params = { "weight_concentration_prior": LogUniformSampler(1e-6, 10), "mean_precision_prior": LogUniformSampler(1, 10), } model = BayesianGaussianMixture( max_components, n_init=N_INIT, max_iter=MAX_ITER, covariance_type=covariance_type, reg_covar=REG_COVAR, ) rs = RandomizedSearchCV( model, param_distributions=params, n_iter=RANDOMSEARCH_TRIALS, cv=KFold(n_splits=KFOLD_SPLITS, shuffle=True), ) rs.fit(distances) gmm = rs.best_estimator_ # turn the vector representation of the diagonal into a full # precision matrix if covariance_type == "diag": precisions = gmm.precisions_ assert len(precisions.shape) == 2 new_precisions = [] for i in range(precisions.shape[0]): new_precisions.append(np.diag(precisions[i, :])) precisions = np.array(new_precisions) else: precisions = gmm.precisions_ # convert the list of atoms into the correct form new_atoms = [] for r1, n1, r2, n2 in atoms: new_atoms.append((r1, n1)) new_atoms.append((r2, n2)) # Return the parameters for a GMM return GMMParams( n_components=gmm.weights_.shape[0], n_distances=n_distances, atoms=new_atoms, weights=gmm.weights_, means=gmm.means_, precisions=precisions, )
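# A hypothetical call following the fit_gmm docstring above; atom_list and
# dist_array are placeholders for real inputs, not names from this codebase.
# params = fit_gmm(max_components=8, n_distances=3, atoms=atom_list,
#                  distances=dist_array, regularization_type="bic",
#                  covariance_type="diag")
# print(params.n_components)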
def fit(self, X, y=None):
    """
    Fits a gaussian mixture model to the data.
    Estimate model parameters with the EM algorithm.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.
    y : array-like, shape (n_samples,), optional (default=None)
        List of labels for X if available. Used to compute ARI scores.

    Returns
    -------
    self
    """
    # Deal with number of clusters
    if self.max_components is None:
        lower_ncomponents = 1
        upper_ncomponents = self.min_components
    else:
        lower_ncomponents = self.min_components
        upper_ncomponents = self.max_components

    n_mixture_components = upper_ncomponents - lower_ncomponents + 1

    if upper_ncomponents > X.shape[0]:
        if self.max_components is None:
            msg = "if max_components is None then min_components must be <= "
            msg += "n_samples, but min_components = {}, n_samples = {}".format(
                upper_ncomponents, X.shape[0])
        else:
            msg = "max_components must be <= n_samples, but max_components = "
            msg += "{}, n_samples = {}".format(upper_ncomponents, X.shape[0])
        raise ValueError(msg)
    elif lower_ncomponents > X.shape[0]:
        msg = "min_components must be <= n_samples, but min_components = "
        msg += "{}, n_samples = {}".format(lower_ncomponents, X.shape[0])
        raise ValueError(msg)

    # Get parameters
    random_state = self.random_state

    param_grid = dict(
        covariance_type=self.covariance_type,
        n_components=range(lower_ncomponents, upper_ncomponents + 1),
        random_state=[random_state],
    )
    param_grid = list(ParameterGrid(param_grid))

    models = [[] for _ in range(n_mixture_components)]
    bics = [[] for _ in range(n_mixture_components)]
    aris = [[] for _ in range(n_mixture_components)]

    # ParameterGrid sorts keys alphabetically, so n_components varies fastest;
    # i % n_mixture_components therefore groups results by component count.
    for i, params in enumerate(param_grid):
        model = GaussianMixture(**params)
        model.fit(X)
        models[i % n_mixture_components].append(model)
        bics[i % n_mixture_components].append(model.bic(X))
        if y is not None:
            predictions = model.predict(X)
            aris[i % n_mixture_components].append(
                adjusted_rand_score(y, predictions))

    self.bic_ = pd.DataFrame(
        np.array(bics),
        index=np.arange(lower_ncomponents, upper_ncomponents + 1),
        columns=self.covariance_type,
    )

    if y is not None:
        self.ari_ = pd.DataFrame(
            np.array(aris),
            index=np.arange(lower_ncomponents, upper_ncomponents + 1),
            columns=self.covariance_type,
        )
    else:
        self.ari_ = None

    # Find the minimum bic for each component count, across covariance structures
    bic_mins = [min(bic) for bic in bics]
    bic_argmins = [np.argmin(bic) for bic in bics]

    # Find the component-count index of the overall minimum bic
    model_type_argmin = np.argmin(bic_mins)

    # offset by lower_ncomponents, since the sweep need not start at 1
    self.n_components_ = model_type_argmin + lower_ncomponents
    self.model_ = models[model_type_argmin][bic_argmins[model_type_argmin]]

    return self
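# Hypothetical usage, assuming this fit() belongs to a BIC-sweeping estimator
# exposing the min_components / max_components / covariance_type attributes it
# references (the class name below is a placeholder):
# gc = GaussianCluster(min_components=2, max_components=8,
#                      covariance_type=["full", "diag"])
# gc.fit(X, y)
# print(gc.n_components_, gc.bic_)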
# load the included diabetes dataset diab = load_diabetes(as_frame=True) # view information about the columns print(diab.DESCR) diab_df = diab.data print(diab.target) # since we are not performing regression, we can add the target # column diab_df['s7'] = diab.target # print a summary of our data print(diab_df.describe()) em_gaussian = GaussianMixture(n_components=4, init_params='random', covariance_type='full') cluster_preds = em_gaussian.fit_predict(diab_df) plt.title('Gaussian Mixture Clusters') # we can pick two dimensions of the input data in order to visualize clusters # in R^2. Note that this output will look different depending on which # dimensions you choose to plot plt.xlabel('bmi') plt.ylabel('bp') plt.scatter(diab_df['bmi'], diab_df['bp'], c=cluster_preds, cmap='rainbow') plt.savefig('simple_diabetes_clusters.png', dpi=300) # view the akaike information criterion print(em_gaussian.aic(diab_df)) # view the bayesian information criterion print(em_gaussian.bic(diab_df))
lowest_bic = np.infty # we'll compare BIC scores for four different CV types and # 6 different numbers of components (clusters) to choose the "best" # model n_components_range = range(1, 7) cv_types = ['spherical', 'tied', 'diag', 'full'] for cv_type in cv_types: scores = [] for n_components in n_components_range: # Fit a Gaussian mixture with EM gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type) gmm.fit(X) curr_bic = gmm.bic(X) scores.append(curr_bic) # update tracking variables if new lowest BIC found if curr_bic < lowest_bic: lowest_bic = curr_bic best_gmm = gmm plt.plot(n_components_range, scores, label=cv_type) # now we can inspect the "best" model, as decided by BIC score print('CV:', best_gmm.covariance_type, '| #Components:', best_gmm.n_components, '| BIC:', lowest_bic) plt.legend() plt.savefig('BIC_plot.png', dpi=300)
def _fit_cluster( self, X: np.ndarray, X_subset: np.ndarray, y: Optional[np.ndarray], params: ParamGridType, agg_clustering: Union[List[int], np.ndarray], seed: int, ) -> Dict[str, Any]: label_init = self.label_init if label_init is not None: onehot = _labels_to_onehot(label_init) weights_init, means_init, precisions_init = _onehot_to_initial_params( X, onehot, params[1]["covariance_type"]) gm_params = params[1] gm_params["weights_init"] = weights_init gm_params["means_init"] = means_init gm_params["precisions_init"] = precisions_init elif params[0]["affinity"] != "none": onehot = _labels_to_onehot(agg_clustering) weights_init, means_init, precisions_init = _onehot_to_initial_params( X_subset, onehot, params[1]["covariance_type"]) gm_params = params[1] gm_params["weights_init"] = weights_init gm_params["means_init"] = means_init gm_params["precisions_init"] = precisions_init else: gm_params = params[1] gm_params["init_params"] = "kmeans" gm_params["reg_covar"] = 0 gm_params["max_iter"] = self.max_iter gm_params["random_state"] = seed criter = np.inf # if none of the iterations converge, bic/aic is set to inf # below is the regularization scheme while gm_params["reg_covar"] <= 1 and criter == np.inf: model = GaussianMixture(**gm_params) try: # ignoring warning here because if convergence is not reached, # the regularization is automatically increased with warnings.catch_warnings(): warnings.simplefilter("ignore", ConvergenceWarning) model.fit(X) predictions = model.predict(X) counts = [ sum(predictions == i) for i in range(gm_params["n_components"]) ] # singleton clusters not allowed assert not any([count <= 1 for count in counts]) except ValueError: gm_params["reg_covar"] = _increase_reg(gm_params["reg_covar"]) continue except AssertionError: gm_params["reg_covar"] = _increase_reg(gm_params["reg_covar"]) continue # if the code gets here, then the model has been fit with no errors or # singleton clusters if self.selection_criteria == "bic": criter = model.bic(X) else: criter = model.aic(X) break if y is not None: self.predictions = model.predict(X) ari = adjusted_rand_score(y, self.predictions) else: ari = float("nan") results = { "model": model, "bic/aic": criter, "ari": ari, "n_components": gm_params["n_components"], "affinity": params[0]["affinity"], "linkage": params[0]["linkage"], "covariance_type": gm_params["covariance_type"], "reg_covar": gm_params["reg_covar"], } return results
def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> CallResult[Outputs]:
    """
    TODO: YP description

    **Positional Arguments:**
    inputs:
        - A matrix

    **Optional Arguments:**
    dim:
        - The number of clusters in which to assign the data
    """
    if self._embedding is None:
        self._embedding = inputs[0]

    nodeIDs = inputs[1]
    nodeIDs = np.array([int(i) for i in nodeIDs])

    max_clusters = self.hyperparams['max_clusters']
    if max_clusters < self._embedding.shape[1]:
        self._embedding = self._embedding[:, :max_clusters].copy()

    cov_types = ['full', 'tied', 'diag', 'spherical']

    # baseline: a single spherical component
    clf = GaussianMixture(n_components=1, covariance_type='spherical')
    clf.fit(self._embedding)
    BIC_max = -clf.bic(self._embedding)
    cluster_likelihood_max = 1
    cov_type_likelihood_max = "spherical"

    # sweep component counts and covariance types, keeping the highest -BIC
    for i in range(2, max_clusters + 1):
        for k in cov_types:
            clf = GaussianMixture(n_components=i, covariance_type=k)
            clf.fit(self._embedding)
            current_bic = -clf.bic(self._embedding)
            if current_bic > BIC_max:
                BIC_max = current_bic
                cluster_likelihood_max = i
                cov_type_likelihood_max = k

    clf = GaussianMixture(n_components=cluster_likelihood_max,
                          covariance_type=cov_type_likelihood_max)
    clf.fit(self._embedding)
    predictions = clf.predict(self._embedding)

    testing = inputs[2]
    testing_nodeIDs = np.asarray(testing['G1.nodeID'])
    testing_nodeIDs = np.array([int(i) for i in testing_nodeIDs])
    final_labels = np.zeros(len(testing))

    for i in range(len(testing_nodeIDs)):
        # temp = np.where(self._nodeIDs == int(testing_nodeIDs[i]))[0][0]
        label = predictions[i]
        final_labels[i] = int(label) + 1

    testing['classLabel'] = final_labels
    outputs = container.DataFrame(testing[['d3mIndex', 'classLabel']])
    outputs[['d3mIndex', 'classLabel']] = outputs[['d3mIndex', 'classLabel']].astype(int)

    return base.CallResult(outputs)
def clustering_experiment(X, y, name, clusters, rdir):
    """Generate results CSVs for given datasets using the K-Means and EM
    clustering algorithms.

    Args:
        X (Numpy.Array): Attributes.
        y (Numpy.Array): Labels.
        name (str): Dataset name.
        clusters (list[int]): List of k values.
        rdir (str): Output directory.

    """
    sse = defaultdict(dict)  # sum of squared errors
    logl = defaultdict(dict)  # log-likelihood
    bic = defaultdict(dict)  # BIC for EM
    aic = defaultdict(dict)  # AIC for EM
    silhouette = defaultdict(dict)  # silhouette score
    acc = defaultdict(lambda: defaultdict(dict))  # accuracy scores
    adjmi = defaultdict(lambda: defaultdict(dict))  # adjusted mutual info
    homo = defaultdict(lambda: defaultdict(dict))  # homogeneity scores
    km = KMeans(random_state=0)  # K-Means
    gmm = GMM(random_state=0)  # Gaussian Mixture Model (EM)

    # start loop for given values of k
    print('DATASET: %s' % name)
    for k in clusters:
        print('K: %s' % k)
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        km.fit(X)
        gmm.fit(X)

        # calculate SSE, log-likelihood, accuracy, and adjusted mutual info
        sse[k][name] = km.score(X)
        logl[k][name] = gmm.score(X)
        acc[k][name]['km'] = cluster_acc(y, km.predict(X))
        acc[k][name]['gmm'] = cluster_acc(y, gmm.predict(X))
        adjmi[k][name]['km'] = ami(y, km.predict(X))
        adjmi[k][name]['gmm'] = ami(y, gmm.predict(X))
        homo[k][name]['km'] = homogeneity_score(y, km.predict(X))
        homo[k][name]['gmm'] = homogeneity_score(y, gmm.predict(X))

        # calculate silhouette score for K-Means
        km_silhouette = silhouette_score(X, km.predict(X))
        silhouette[k][name] = km_silhouette

        # calculate BIC and AIC for EM
        bic[k][name] = gmm.bic(X)
        aic[k][name] = gmm.aic(X)

    # generate output dataframes
    sse = (-pd.DataFrame(sse)).T
    sse.rename(columns={name: 'sse'}, inplace=True)
    logl = pd.DataFrame(logl).T
    logl.rename(columns={name: 'log-likelihood'}, inplace=True)
    bic = pd.DataFrame(bic).T
    bic.rename(columns={name: 'bic'}, inplace=True)
    aic = pd.DataFrame(aic).T
    aic.rename(columns={name: 'aic'}, inplace=True)
    silhouette = pd.DataFrame(silhouette).T
    silhouette.rename(columns={name: 'silhouette_score'}, inplace=True)
    acc = pd.Panel(acc)
    acc = acc.loc[:, :, name].T.rename(lambda x: '{}_acc'.format(x),
                                       axis='columns')
    adjmi = pd.Panel(adjmi)
    adjmi = adjmi.loc[:, :, name].T.rename(lambda x: '{}_adjmi'.format(x),
                                           axis='columns')
    homo = pd.Panel(homo)
    homo = homo.loc[:, :, name].T.rename(lambda x: '{}_homo'.format(x),
                                         axis='columns')

    # concatenate all results
    dfs = (sse, silhouette, logl, bic, aic, acc, adjmi, homo)
    metrics = pd.concat(dfs, axis=1)
    print(metrics)
    resfile = get_abspath('{}_train_metrics.csv'.format(name), rdir)
    metrics.to_csv(resfile, index_label='k')
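# A hypothetical invocation of clustering_experiment; the dataset name, k range,
# and output directory are assumptions for illustration.
# clustering_experiment(X, y, name='digits', clusters=list(range(2, 11)),
#                       rdir='results/clustering')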
def cluster_segments(self) -> None:
    """Clusters the input segments :attr:`self.raw_segments` based on the
    parameters passed as argument.
    """
    Logger.debug("Clustering segments")
    if self.params.cluster_type not in ["gmm", "knn"]:
        Logger.fatal("Invalid value for cluster type: {}".format(
            self.params.cluster_type))
        raise ValueError(
            "Invalid value for 'cluster_type': {} "
            "'cluster_type' should be in ['gmm', 'knn']".format(
                self.params.cluster_type))

    centers = []
    angles = []
    for segment in self.raw_segments:
        pt1 = segment[0:2]
        pt2 = segment[2:4]
        center = (pt1 + pt2) * 0.5
        centers.append(center)
        # Segment angle lies in [0, pi]; multiply by 2 so that the complex numbers
        # associated with similar angles are close on the complex plane (e.g. 180° and 0°).
        angle = tg.utils.angle(pt1, pt2) * 2
        # Need to use the complex representation, as the Euclidean distance used in
        # clustering makes sense on the complex plane, but not directly on angles.
        point = np.array([np.cos(angle), np.sin(angle)])
        angles.append(point)

    centers = np.array(centers)
    centers = normalize(centers, axis=0)
    angles = np.array(angles)

    if self.params.use_angles and self.params.use_centers:
        features = np.hstack((angles, centers))
    elif self.params.use_angles:
        features = angles
    elif self.params.use_centers:
        features = centers
    else:
        raise RuntimeError(
            "Can not perform segment clustering without any feature. "
            "Select 'use_angles=True' and/or 'use_centers=True'.")

    cluster_prediction = None
    if self.params.cluster_type == "knn":
        Logger.debug("Clustering segments using KNN")
        cluster_prediction = KMeans(n_clusters=self.params.num_clusters,
                                    n_init=self.params.num_init,
                                    random_state=0).fit_predict(features)
    elif self.params.cluster_type == "gmm":
        Logger.debug("Clustering segments using GMM")
        best_gmm = None
        lowest_bic = np.infty
        bic = []
        n_components_range = range(1, self.params.num_clusters + 1)
        if not self.params.swipe_clusters:
            n_components_range = [self.params.num_clusters]
        for n_components in n_components_range:
            # Fit a Gaussian mixture with EM.
            gmm = GaussianMixture(n_components=n_components,
                                  covariance_type='full')
            gmm.fit(features)
            bic.append(gmm.bic(features))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm
        cluster_prediction = best_gmm.predict(features)

    # Reorder the segments as clusters.
    cluster_segment_list = []
    cluster_feature_list = []
    num_labels = np.max(cluster_prediction) + 1
    for label in range(num_labels):
        cluster_segments = self.raw_segments[cluster_prediction == label]
        if len(cluster_segments) == 0:
            continue
        cluster_features = features[cluster_prediction == label]
        cluster_segment_list.append(cluster_segments)
        cluster_feature_list.append(cluster_features)
    self.cluster_list = cluster_segment_list
    self.cluster_features = cluster_feature_list
def fit(self, X, Y, epochs, batch_size): EPOCHS = epochs BATCH_SIZE = batch_size n = len(X) XY = np.concatenate((X, Y), axis=1) #df = n - 1 self._X = X.copy() hidden_neurons = self.hidden_neurons if self.n_mixtures == -1: lowest_bic = np.infty bic = [] n_components_range = range(1, 7) cv_types = ['spherical', 'tied', 'diag', 'full'] for cv_type in cv_types: for n_components in n_components_range: # Fit a Gaussian mixture with EM gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type, max_iter=10000) gmm.fit(XY) bic.append(gmm.bic(XY)) if bic[-1] < lowest_bic: lowest_bic = bic[-1] best_gmm = gmm self.n_mixtures = n_components clusterer = HDBSCAN() clusterer.fit(XY) clusterer.labels_ if len(np.unique(clusterer.labels_)) < self.n_mixtures: self.n_mixtures = len(np.unique(clusterer.labels_)) else: pass if self.gmm_boost == True: if len(np.unique(clusterer.labels_)) < self.n_mixtures: clusterer = HDBSCAN() clusterer.fit(X) clusters = clusterer.labels_ else: clusterer = best_gmm clusterer.fit(X) clusters = clusterer.predict_proba(X) self._clusterer = clusterer X = np.concatenate((X, clusters), axis=1) else: pass elif self.gmm_boost == True: clusterer1 = BayesianGaussianMixture(n_components=self.n_mixtures, covariance_type='full', max_iter=10000) clusterer1.fit(X) clusters = clusterer1.predict_proba(X) self._clusterer = clusterer1 clusterer2 = HDBSCAN() clusterer2.fit(X) if len(np.unique(clusterer2.labels_)) < self.n_mixtures: clusters = clusterer2.labels_ self._clusterer = clusterer2 else: pass X = np.concatenate((X, clusters), axis=1) else: pass self._y = Y.copy() dataset = tf.compat.v1.data.Dataset \ .from_tensor_slices((X, Y)) \ .repeat(EPOCHS).shuffle(len(X)).batch(BATCH_SIZE) iter_ = tf.compat.v1.data.make_one_shot_iterator(dataset) x, y = iter_.get_next() K = self.n_mixtures self.K = K self.x = x input_activation = self.input_activation hidden_activation = self.hidden_activation if input_activation.lower() == 'crelu': input_actv = tf.nn.crelu elif input_activation.lower() == 'relu6': input_actv = tf.nn.relu6 elif input_activation.lower() == 'elu': input_actv = tf.nn.elu elif input_activation.lower() == 'selu': input_actv = tf.nn.selu elif input_activation.lower() == 'leaky_relu': input_actv = tf.nn.leaky_relu elif input_activation.lower() == 'relu': input_actv = tf.nn.relu elif input_activation.lower() == 'swish': input_actv = tf.nn.swish elif input_activation.lower() == 'tanh': input_actv = tf.nn.tanh elif input_activation.lower() == 'linear': input_actv = None elif input_activation.lower() == 'softplus': input_actv = tf.nn.softplus elif input_activation.lower() == 'sigmoid': input_actv = tf.nn.sigmoid elif input_activation.lower() == 'softmax': input_actv = tf.nn.softmax else: input_actv = tf.nn.relu if hidden_activation.lower() == 'crelu': h_actv = tf.nn.crelu elif hidden_activation.lower() == 'relu6': h_actv = tf.nn.relu6 elif hidden_activation.lower() == 'elu': h_actv = tf.nn.elu elif hidden_activation.lower() == 'selu': h_actv = tf.nn.selu elif hidden_activation.lower() == 'leaky_relu': h_actv = tf.nn.leaky_relu elif hidden_activation.lower() == 'relu': h_actv = tf.nn.relu elif hidden_activation.lower() == 'swish': h_actv = tf.nn.swish elif hidden_activation.lower() == 'tanh': h_actv = tf.nn.tanh elif hidden_activation.lower() == 'linear': h_actv = None elif hidden_activation.lower() == 'softplus': h_actv = tf.nn.softplus elif hidden_activation.lower() == 'sigmoid': h_actv = tf.nn.sigmoid elif hidden_activation.lower() == 'softmax': h_actv = tf.nn.softmax else: 
h_actv = tf.nn.relu n_layer = len(hidden_neurons) if n_layer < 1: self.layer_last = tf.layers.dense(x, units=self.input_neurons, activation=input_actv) self.mu = tf.layers.dense(self.layer_last, units=K, activation=None, name="mu") self.var = tf.exp( tf.layers.dense(self.layer_last, units=K, activation=None, name="sigma")) self.pi = tf.layers.dense(self.layer_last, units=K, activation=tf.nn.softmax, name="mixing") else: self.layer_1 = tf.layers.dense(x, units=self.input_neurons, activation=input_actv) for i in range(2, n_layer + 2): n_neurons = hidden_neurons[i - 2] if i == n_layer + 1: print('last', i) string_var = 'self.layer_last = tf.layers.dense(self.layer_' + str( i - 1) + ', units=n_neurons, activation=h_actv)' else: print(i) string_var = 'self.layer_' + str( i) + ' = tf.layers.dense(self.layer_' + str( i - 1) + ', units=n_neurons, activation=h_actv)' exec(string_var) self.mu = tf.layers.dense(self.layer_last, units=K, activation=None, name="mu") self.var = tf.exp( tf.layers.dense(self.layer_last, units=K, activation=None, name="sigma")) self.pi = tf.layers.dense(self.layer_last, units=K, activation=tf.nn.softmax, name="mixing") if self.tf_mixture_family == False: #---------------- Not using TF Mixture Family ------------------------ if self.dist.lower() == 'normal': self.likelihood = tfp.distributions.Normal(loc=self.mu, scale=self.var) elif (self.dist.lower() == 'laplacian' or self.dist.lower() == 'laplace') == True: self.likelihood = tfp.distributions.Laplace(loc=self.mu, scale=self.var) elif self.dist.lower() == 'lognormal': self.likelihood = tfp.distributions.LogNormal(loc=self.mu, scale=self.var) elif self.dist.lower() == 'gamma': alpha = (self.mu**2) / self.var beta = self.var / self.mu self.likelihood = tfp.distributions.Gamma(concentration=alpha, rate=beta) else: self.likelihood = tfp.distributions.Normal(loc=self.mu, scale=self.var) self.out = self.likelihood.prob(y) self.out = tf.multiply(self.out, self.pi) self.out = tf.reduce_sum(self.out, 1, keepdims=True) self.out = -tf.log(self.out + 1e-10) self.mean_loss = tf.reduce_mean(self.out) else: # -------------------- Using TF Mixture Family ------------------------ self.mixture_distribution = tfp.distributions.Categorical( probs=self.pi) if self.dist.lower() == 'normal': self.distribution = tfp.distributions.Normal(loc=self.mu, scale=self.var) elif (self.dist.lower() == 'laplacian' or self.dist.lower() == 'laplace') == True: self.distribution = tfp.distributions.Laplace(loc=self.mu, scale=self.var) elif self.dist.lower() == 'lognormal': #self.distribution = tfp.edward2.LogNormal(loc=self.mu, scale=self.var) self.distribution = tfp.distributions.LogNormal(loc=self.mu, scale=self.var) elif self.dist.lower() == 'gamma': alpha = (self.mu**2) / self.var beta = self.var / self.mu self.distribution = tfp.distributions.Gamma( concentration=alpha, rate=beta) else: self.distribution = tfp.distributions.Normal(loc=self.mu, scale=self.var) self.likelihood = tfp.distributions.MixtureSameFamily( mixture_distribution=self.mixture_distribution, components_distribution=self.distribution) self.log_likelihood = -self.likelihood.log_prob(tf.transpose(y)) self.mean_loss = tf.reduce_mean(self.log_likelihood) # ---------------------------------------------------------------------- self.global_step = tf.Variable(0, trainable=False) if self.optimizer.lower() == 'adam': self.train_op = tf.compat.v1.train.AdamOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) elif self.optimizer.lower() == 'adadelta': self.train_op = 
tf.compat.v1.train.AdadeltaOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) elif self.optimizer.lower() == 'adagradda': self.train_op = tf.compat.v1.train.AdagradDAOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) elif self.optimizer.lower() == 'adagrad': self.train_op = tf.compat.v1.train.AdagradOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) elif self.optimizer.lower() == 'ftrl': self.train_op = tf.compat.v1.train.FtrlOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) elif self.optimizer.lower() == 'gradientdescent': self.train_op = tf.compat.v1.train.GradientDescentOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) elif self.optimizer.lower() == 'momentum': self.train_op = tf.compat.v1.train.MomentumOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) elif self.optimizer.lower() == 'proximaladagrad': self.train_op = tf.compat.v1.train.ProximalAdagradOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) elif self.optimizer.lower() == 'proximalgradientdescent': self.train_op = tf.compat.v1.train.ProximalGradientDescentOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) elif self.optimizer.lower() == 'rmsprop': self.train_op = tf.compat.v1.train.RMSPropOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) else: self.train_op = tf.compat.v1.train.AdamOptimizer( learning_rate=self.learning_rate).minimize(self.mean_loss) self.init = tf.compat.v1.global_variables_initializer() # Initialize coefficients self.sess = tf.compat.v1.Session() self.sess.run(self.init) best_loss = 1e+10 self.stopping_step = 0 for i in range(EPOCHS * (n // BATCH_SIZE)): _, loss, mu, var, pi, x__ = self.sess.run([ self.train_op, self.mean_loss, self.mu, self.var, self.pi, self.x ]) if loss < best_loss: self.stopping_step = 0 self.best_loss = loss best_mu = mu best_var = var best_pi = pi best_mean_y = mu[:, 0] best_x = x__ best_loss = loss print("Epoch: {} Loss: {:3.3f}".format(i, loss)) else: self.stopping_step += 1 if self.stopping_step >= self.early_stopping: self.should_stop = True print("Early stopping is trigger at step: {} loss:{}".format( i, loss)) return else: pass self._mean_y_train = mu[:, 0] self._dist_mu_train = mu self._dist_var_train = var self._dist_pi_train = pi self._x_data_train = x__
def gmm_information_criteria_report(
        X_mat,
        k=np.arange(1, 20),
        covar_type=['full', 'tied', 'diag', 'spherical'],
        random_seed=11238,
        out="Graph"):
    # Collect AIC/BIC curves for each covariance type, then reshape them into
    # dataframes with the handle_df helper.
    tmp_global_aic, tmp_global_bic = [], []
    for i in covar_type:
        tmp_iter_aic, tmp_iter_bic = [], []
        for j in k:
            tmp_model = GaussianMixture(j, covariance_type=i,
                                        random_state=random_seed).fit(X_mat)
            tmp_iter_aic.append(tmp_model.aic(X_mat))
            tmp_iter_bic.append(tmp_model.bic(X_mat))
        tmp_global_aic.append(tmp_iter_aic)
        tmp_global_bic.append(tmp_iter_bic)

    tmp_get_aic = handle_df(tmp_global_aic, covar_type)
    tmp_get_bic = handle_df(tmp_global_bic, covar_type)
    tmp_get_aic_max = pd.melt(tmp_get_aic, id_vars=['n_components'],
                              value_vars=covar_type).sort_values(by='value')
    tmp_get_bic_max = pd.melt(tmp_get_bic, id_vars=['n_components'],
                              value_vars=covar_type).sort_values(by='value')
    tmp_top_aic = tmp_get_aic_max.head(3)
    tmp_top_bic = tmp_get_bic_max.head(3)

    if out == "Graph":
        plt.subplot(2, 1, 1)
        for colname, index in tmp_get_aic.drop(
                columns='n_components').iteritems():
            plt.plot(index, label=colname)
        plt.scatter(tmp_top_aic['n_components'], tmp_top_aic['value'],
                    edgecolors='slategrey', facecolor='none', lw=2,
                    label="Best hyperparams")
        plt.title('Akaike Information Criteria')
        plt.xticks(k - 1, k)
        plt.xlabel('Number of clusters estimated')
        plt.ylabel('AIC')
        plt.legend()

        plt.subplot(2, 1, 2)
        for colname, index in tmp_get_bic.drop(
                columns='n_components').iteritems():
            plt.plot(index, label=colname)
        plt.scatter(tmp_top_bic['n_components'], tmp_top_bic['value'],
                    edgecolors='slategrey', facecolor='none', lw=2,
                    label="Best hyperparams")
        plt.title('Bayesian Information Criteria')
        plt.xticks(k - 1, k)
        plt.xlabel('Number of clusters estimated')
        plt.ylabel('BIC')
        plt.legend()
    else:
        return tmp_get_aic_max, tmp_get_bic_max
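# A minimal driver sketch for the report function above. The synthetic X_mat is
# an assumption, and it presumes the handle_df helper used above is in scope;
# out="Table" exercises the non-graph branch.
import numpy as np

X_mat_demo = np.random.RandomState(11238).normal(size=(300, 3))
aic_rank, bic_rank = gmm_information_criteria_report(
    X_mat_demo, k=np.arange(1, 6), out="Table")
print(bic_rank.head())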
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:9])
X[:, 1:9] = imputer.transform(X[:, 1:9])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,
                                                    random_state=1)

lowest_bic = np.infty
bic = []
for n_components in range(1, 4):
    classifier = GaussianMixture(n_components=n_components,
                                 covariance_type='full')
    # note: GaussianMixture is unsupervised; the y_train argument is ignored
    classifier.fit(X_train, y_train)
    # evaluate BIC on the training split used for fitting
    bic.append(classifier.bic(X_train))
    if bic[-1] < lowest_bic:
        lowest_bic = bic[-1]
        best_gmm = classifier

y_pred = best_gmm.predict(X_test)
score = accuracy_score(y_pred, y_test)
print(score)

# # Applying k-Fold Cross Validation
# from sklearn.model_selection import cross_val_score
# accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
# print(accuracies.mean())
# print(accuracies.std())
print(gmm.covariances_)

# now repeat, but use AIC and BIC to identify the optimum number of components
comp = np.arange(1, 9, 1)
ncomp = np.shape(comp)[0]
aic = []
bic = []
axes = []
yres = []
for i in range(ncomp):
    cnow = comp[i]
    gmm = GaussianMixture(n_components=cnow)
    gmm.fit(xin)
    aic.append(gmm.aic(xin))
    bic.append(gmm.bic(xin))
    yres.append(np.exp(gmm.score_samples(xr)))
    print('ncomponents...', cnow, ' aic', aic[i], ' bic', bic[i])

# make plots of all the tested numbers of components
fig = plt.figure()
i_best = np.argmin(aic)
for i in range(ncomp):
    cnow = comp[i]
    ax1 = fig.add_subplot(int(np.ceil(ncomp / 2.)), 2, i + 1)
    axes.append(ax1)
    ilo = 0
    for i2 in range(ndis):
        if (i2 > 0):
            ilo = ihi
        ihi = ilo + nsamp[i2]
counter += iterations xcoords[-1] = 2 * ds y = meanMatrix[0].reshape(-1, 1) #clustering = KMeans(n_clusters=k).fit(meanMatrix[0].reshape(-1,1)) bics = [] aics = [] for kk in range(1, 7): gmm = GaussianMixture(n_components=kk, covariance_type='spherical', random_state=1991) gmm.fit(y) bics.append(gmm.bic(y)) aics.append(gmm.aic(y)) kkBest = 1 + np.argmin(bics) gmm = GaussianMixture(n_components=kkBest, covariance_type='spherical', random_state=1991) gmm.fit(y) labels = gmm.predict(y) fig = plt.figure() #0 coverage, 1 global, 2 local colors = np.array(['C' + str(i) for i in range(kkBest)]) alphas = np.array([0.2, 1.])
AIC = np.zeros((T,)) CVE = np.zeros((T,)) # K-fold crossvalidation CV = model_selection.KFold(n_splits=10, shuffle=True) for t, K in enumerate(KRange): print('Fitting model for K={0}'.format(K)) # Fit Gaussian mixture model gmm = GaussianMixture(n_components=K, covariance_type=covar_type, n_init=reps, init_params=init_procedure, tol=1e-6, reg_covar=1e-6).fit(X_norm) # Get BIC and AIC BIC[t,] = gmm.bic(X_norm) AIC[t,] = gmm.aic(X_norm) # For each crossvalidation fold for train_index, test_index in CV.split(X_norm): # extract training and test set for current CV fold X_train = X_norm[train_index] X_test = X_norm[test_index] # Fit Gaussian mixture model to X_train gmm = GaussianMixture(n_components=K, covariance_type=covar_type, n_init=reps).fit(X_train) # compute negative log likelihood of X_test CVE[t] += -gmm.score_samples(X_test).sum() opt_clust = KRange[CVE.argmin()]
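# An optional comparison sketch: overlay the three criteria computed above
# (assumes matplotlib.pyplot is imported as plt, as elsewhere in this collection).
plt.figure()
plt.plot(KRange, BIC, '-o', label='BIC')
plt.plot(KRange, AIC, '-o', label='AIC')
plt.plot(KRange, 2 * CVE, '-o', label='2 x neg. test log-likelihood')
plt.xlabel('K')
plt.ylabel('criterion value')
plt.legend()
plt.show()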
m = np.array(((1, 1), (1, 3)))
x1 = x1.dot(m)  # rotate/shear x1 so the variance along each axis changes
x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
x = np.vstack((x1, x2))
y = np.array([0] * N1 + [1] * N2)

# spherical: circular; diag: axis-aligned; tied: one covariance shared by all
# components; full: each component has its own unconstrained covariance
types = ('spherical', 'diag', 'tied', 'full')
err = np.empty(len(types))
bic = np.empty(len(types))
for i, type in enumerate(types):
    gmm = GaussianMixture(n_components=2, covariance_type=type,
                          random_state=0)
    gmm.fit(x)
    err[i] = 1 - accuracy_rate(gmm.predict(x), y)
    bic[i] = gmm.bic(x)
print('error rate:', err.ravel())
print('BIC:', bic.ravel())

xpos = np.arange(4)
ax = plt.axes()
b1 = ax.bar(xpos - 0.3, err, width=0.3, color='#77E0A0')
# twinx() creates a second y-axis mirroring ax and sharing its x-axis
b2 = ax.twinx().bar(xpos, bic, width=0.3, color='#FF8080')
plt.grid(True)
bic_min, bic_max = expand(bic.min(), bic.max())
plt.ylim((bic_min, bic_max))
plt.xticks(xpos, types)
plt.legend([b1[0], b2[0]], ('error rate', 'BIC'))
plt.title('Error rate and BIC for different covariance types', fontsize=18)
plt.show()
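# The helpers expand and accuracy_rate are not defined in this snippet.
# A minimal sketch of plausible implementations, assuming expand pads an axis
# range and accuracy_rate scores a two-cluster labeling under the better of
# the two possible index assignments. Both are assumptions, not the original
# code.
from sklearn.metrics import accuracy_score

def expand(lo, hi, margin=0.05):
    # pad a [lo, hi] interval by a fraction of its width for nicer axis limits
    d = (hi - lo) * margin
    return lo - d, hi + d

def accuracy_rate(y_pred, y_true):
    # cluster indices are arbitrary: with two components, score both the
    # identity labeling and the swapped labeling and keep the better one
    a = accuracy_score(y_true, y_pred)
    return max(a, 1 - a)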
# Score lists (initialized here so the excerpt is self-contained).
em_fitness_times = []
em_sil_scores = []
em_homo_scores = []
em_aic_scores = []
em_bic_scores = []
for k in Kclusters:
    t1 = time.time()
    # warm_start has no effect here because a fresh estimator is built for
    # every k; it is kept only to match the original configuration
    em = GaussianMixture(n_components=k, covariance_type='diag', n_init=1,
                         warm_start=True, random_state=100).fit(X1)
    t2 = time.time()
    em_fitness_times.append(t2 - t1)
    em_sil_scores.append(silhouette_score(X1, em.predict(X1)))
    em_homo_scores.append(homogeneity_score(Y1, em.predict(X1)))
    em_aic_scores.append(em.aic(X1))
    em_bic_scores.append(em.bic(X1))

# [KM] Plot the cluster score over K clusters
plt.title("Cluster Score for K-Means (KM) on ICA " + Dataset)
plt.xlabel("K clusters")
plt.ylabel("Inertia")
#plt.ylim(0.0, 1.1)
lw = 2
plt.plot(Kclusters, km_inertia_scores, label="inertia", color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

# [KM] Plot the fitness time over K clusters
plt.title("Fitness Time for K-Means (KM) on ICA " + Dataset)
plt.xlabel("K clusters")
plt.ylabel("Fitness Time (s)")
def em(X_train, X_test, y_train, y_test, no_iter=1000,
       component_list=[3, 4, 5, 6, 7, 8, 9, 10, 11], num_class=7,
       toshow=1, file_no=1):
    array_aic = []
    array_bic = []
    array_homo = []
    array_comp = []
    array_sil = []
    array_avg_log = []
    for num_classes in component_list:
        clf = GaussianMixture(n_components=num_classes,
                              covariance_type='spherical',
                              max_iter=no_iter, init_params='kmeans')
        # clf = KMeans(n_clusters=num_classes, init='k-means++')
        clf.fit(X_train)
        y_test_pred = clf.predict(X_test)

        # Per-sample average log-likelihood
        avg_log = clf.score(X_test)
        array_avg_log.append(avg_log)

        # AIC on the test data
        aic = clf.aic(X_test)
        array_aic.append(aic)

        # BIC on the test data
        bic = clf.bic(X_test)
        array_bic.append(bic)

        # Homogeneity score on the test data
        homo = metrics.homogeneity_score(y_test, y_test_pred)
        array_homo.append(homo)

        # Completeness score
        comp = metrics.completeness_score(y_test, y_test_pred)
        array_comp.append(comp)

        # Silhouette score
        sil = metrics.silhouette_score(X_test, y_test_pred, metric='euclidean')
        array_sil.append(sil)

    # Generating plots (file_no is an int, so cast it before concatenating)
    fig1, ax1 = plt.subplots()
    ax1.plot(component_list, array_aic)
    ax1.plot(component_list, array_bic)
    plt.legend(['AIC', 'BIC'])
    plt.xlabel('Number of clusters')
    plt.title('AIC/BIC curve for Expectation Maximization')
    if toshow == 1:
        plt.savefig(str(file_no) + "em1")

    fig2, ax2 = plt.subplots()
    ax2.plot(component_list, array_homo)
    ax2.plot(component_list, array_sil)
    plt.legend(['homogeneity', 'silhouette'])
    plt.xlabel('Number of clusters')
    plt.title('Performance evaluation scores for Expectation Maximization')
    if toshow == 1:
        plt.savefig(str(file_no) + "em2")

    fig3, ax3 = plt.subplots()
    ax3.plot(component_list, array_avg_log)
    plt.xlabel('Number of clusters')
    plt.title('Per-sample average log-likelihood for Expectation Maximization')
    if toshow == 1:
        plt.savefig(str(file_no) + "em3")
    plt.show()

    # Training and testing accuracy for K = number of classes.
    # Note: this compares raw component indices to class labels, so the
    # reported accuracy is only meaningful if the indices happen to align.
    clf = GaussianMixture(n_components=num_class, covariance_type='spherical',
                          max_iter=no_iter, init_params='kmeans')
    clf.fit(X_train)

    # Training accuracy
    y_train_pred = clf.predict(X_train)
    train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
    print('Training accuracy for Expectation Maximization for K = {}: {}'.format(
        num_class, train_accuracy))

    # Testing accuracy
    y_test_pred = clf.predict(X_test)
    test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
    print('Testing accuracy for Expectation Maximization for K = {}: {}'.format(
        num_class, test_accuracy))

    return (component_list, array_aic, array_bic, array_homo, array_comp,
            array_sil, array_avg_log)
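# A minimal usage sketch for em() above. The data split is an assumption;
# any feature matrix X and integer label vector y will do.
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
(component_list, aic, bic, homo, comp,
 sil, avg_log) = em(X_tr, X_te, y_tr, y_te,
                    component_list=[3, 5, 7], num_class=7,
                    toshow=0, file_no=1)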
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(
    outpath="charts/income.k-means.randomization.SilhouetteVisualizer.png")

lowest_bic = np.inf  # np.infty was removed in NumPy 2.0; np.inf is canonical
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type)
        gmm.fit(results)
        bic.append(gmm.bic(results))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

bic = np.array(bic)
color_iter = itertools.cycle(
    ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
clf = best_gmm
bars = []

# Plot the BIC scores as grouped bars, one group per component count
plt.figure(figsize=(8, 6))
spl = plt.subplot(2, 1, 1)
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
    xpos = np.array(n_components_range) + .2 * (i - 2)
def __do_perform(self, custom_out=None, main_experiment=None):
    # e.g. './output/ICA/clustering/{}' when called from ICAExperiment
    if custom_out is not None:
        # if not os.path.exists(custom_out):
        #     os.makedirs(custom_out)
        self._old_out = self._out  # './output/ICA/{}'
        self._out = custom_out     # './output/ICA/clustering/{}'
    elif self._old_out is not None:
        self._out = self._old_out

    if main_experiment is not None:
        self.log("Performing {} as part of {}".format(
            self.experiment_name(), main_experiment.experiment_name()))  # 'clustering', 'ICA'
    else:
        self.log("Performing {}".format(self.experiment_name()))

    # Adapted from https://github.com/JonathanTay/CS-7641-assignment-3/blob/master/clustering.py
    # %% Data for 1-3
    sse = defaultdict(list)
    ll = defaultdict(list)
    bic = defaultdict(list)
    sil = defaultdict(lambda: defaultdict(list))
    sil_s = np.empty(shape=(2 * len(self._clusters) *
                            self._details.ds.training_x.shape[0], 4),
                     dtype='<U21')
    acc = defaultdict(lambda: defaultdict(float))
    adj_mi = defaultdict(lambda: defaultdict(float))
    km = kmeans(random_state=self._details.seed)
    gmm = GMM(random_state=self._details.seed)

    # time.clock() was removed in Python 3.8; perf_counter() is the closest
    # replacement (assumes 'from time import perf_counter')
    st = perf_counter()
    j = 0
    for k in self._clusters:
        km.set_params(n_clusters=k)
        gmm.set_params(n_components=k)
        # cluster the ICA-transformed input features with k-Means and GMM
        # for each candidate k
        km.fit(self._details.ds.training_x)
        gmm.fit(self._details.ds.training_x)

        # assign every ICA-transformed instance a cluster label
        km_labels = km.predict(self._details.ds.training_x)
        gmm_labels = gmm.predict(self._details.ds.training_x)

        # mean silhouette score over all ICA-transformed instances
        sil[k]['Kmeans'] = sil_score(self._details.ds.training_x, km_labels)
        sil[k]['GMM'] = sil_score(self._details.ds.training_x, gmm_labels)

        # per-instance silhouette scores
        km_sil_samples = sil_samples(self._details.ds.training_x, km_labels)
        gmm_sil_samples = sil_samples(self._details.ds.training_x, gmm_labels)

        # There has got to be a better way to do this
        for i, x in enumerate(km_sil_samples):
            # record the silhouette score x for instance i given its label
            # km_labels[i] from k-Means with this k
            sil_s[j] = [k, 'Kmeans', round(x, 6), km_labels[i]]
            j += 1
        for i, x in enumerate(gmm_sil_samples):
            sil_s[j] = [k, 'GMM', round(x, 6), gmm_labels[i]]
            j += 1

        # KMeans.score returns the negative inertia (the negative sum of
        # squared distances to the closest centroid), so negating it later
        # recovers the SSE
        sse[k] = [km.score(self._details.ds.training_x)]
        # per-sample average log-likelihood
        ll[k] = [gmm.score(self._details.ds.training_x)]
        # Bayesian information criterion of the GMM on the input X
        bic[k] = [gmm.bic(self._details.ds.training_x)]
        # score the clustering against the original y-labels, assuming each
        # cluster predicts its majority y-label
        acc[k]['Kmeans'] = cluster_acc(self._details.ds.training_y, km_labels)
        acc[k]['GMM'] = cluster_acc(self._details.ds.training_y, gmm_labels)

        # adjusted mutual information between the true labels and the
        # predicted cluster labels (how well the clustering matches truth)
        adj_mi[k]['Kmeans'] = ami(self._details.ds.training_y, km_labels)
        adj_mi[k]['GMM'] = ami(self._details.ds.training_y, gmm_labels)

        self.log("Cluster: {}, time: {}".format(k, perf_counter() - st))

    sse = (-pd.DataFrame(sse)).T  # negate to turn KMeans scores into SSE
    sse.index.name = 'k'
    sse.columns = ['{} sse (left)'.format(self._details.ds_readable_name)]  # e.g. 'Bank sse (left)'
    ll = pd.DataFrame(ll).T
    ll.index.name = 'k'
    ll.columns = ['{} log-likelihood'.format(self._details.ds_readable_name)]
    bic = pd.DataFrame(bic).T
    bic.index.name = 'k'
    bic.columns = ['{} BIC'.format(self._details.ds_readable_name)]
    sil = pd.DataFrame(sil).T
    sil_s = pd.DataFrame(sil_s,
                         columns=['k', 'type', 'score', 'label']).set_index('k')
    acc = pd.DataFrame(acc).T
    adj_mi = pd.DataFrame(adj_mi).T
    sil.index.name = 'k'
    sil_s.index.name = 'k'
    acc.index.name = 'k'
    adj_mi.index.name = 'k'

    # write scores to files
    sse.to_csv(self._out.format('{}_sse.csv'.format(self._details.ds_name)))
    ll.to_csv(self._out.format('{}_logliklihood.csv'.format(self._details.ds_name)))
    bic.to_csv(self._out.format('{}_bic.csv'.format(self._details.ds_name)))
    sil.to_csv(self._out.format('{}_sil_score.csv'.format(self._details.ds_name)))
    sil_s.to_csv(self._out.format('{}_sil_samples.csv'.format(self._details.ds_name)))
    acc.to_csv(self._out.format('{}_acc.csv'.format(self._details.ds_name)))
    adj_mi.to_csv(self._out.format('{}_adj_mi.csv'.format(self._details.ds_name)))

    # %% NN fit data (2,3)
    # train a NN on the clustered data; inside the Pipeline, KMeans acts as a
    # transformer, so the NN sees each instance's distances to the k cluster
    # centers rather than the raw input features
    grid = {'km__n_clusters': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000,
                        early_stopping=True, random_state=self._details.seed)
    # note: KMeans dropped its n_jobs parameter in scikit-learn 1.0
    km = kmeans(random_state=self._details.seed, n_jobs=self._details.threads)
    pipe = Pipeline([('km', km), ('NN', mlp)],
                    memory=experiments.pipeline_memory)
    gs, _ = self.gs_with_best_estimator(pipe, grid, type='kmeans')  # write the best NN to file
    self.log("KMeans grid search complete")

    tmp = pd.DataFrame(gs.cv_results_)
    # write grid search results --> e.g. bank_cluster_kmeans.csv
    tmp.to_csv(self._out.format('{}_cluster_kmeans.csv'.format(self._details.ds_name)))

    grid = {'gmm__n_components': self._clusters,
            'NN__alpha': self._nn_reg,
            'NN__hidden_layer_sizes': self._nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000,
                        early_stopping=True, random_state=self._details.seed)
    gmm = CustomGMM(random_state=self._details.seed)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)],
                    memory=experiments.pipeline_memory)
    gs, _ = self.gs_with_best_estimator(pipe, grid, type='gmm')  # write the best NN to file
    self.log("GMM grid search complete")

    tmp = pd.DataFrame(gs.cv_results_)
    # write grid search results --> e.g. bank_cluster_GMM.csv
    tmp.to_csv(self._out.format('{}_cluster_GMM.csv'.format(self._details.ds_name)))

    # %% For charts 4/5
    # project the training data to 2-D with t-SNE so the cluster structure
    # can be visualized
    self._details.ds.training_x2D = TSNE(
        verbose=10, random_state=self._details.seed).fit_transform(
            self._details.ds.training_x)

    ds_2d = pd.DataFrame(
        np.hstack((self._details.ds.training_x2D,
                   np.atleast_2d(self._details.ds.training_y).T)),
        columns=['x', 'y', 'target'])  # 2-D t-SNE features plus the label
    ds_2d.to_csv(self._out.format('{}_2D.csv'.format(
        self._details.ds_name)))  # --> e.g. bank_2D.csv
    self.log("Done")
w = np.exp(-np.exp(3 * w.mean(axis=1)))

# GMM model selection with BIC:
lowest_bic = np.inf
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type, n_init=5)
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

preds = best_gmm.predict(X)
probs = best_gmm.predict_proba(X)

# bic was filled with the component count varying fastest, so each row of
# reshape(len(cv_types), -1) is one covariance type's curve; the original
# reshape(-1, len(cv_types)).T interleaved the values and mislabeled the lines
for name, col in zip(cv_types, np.array(bic).reshape(len(cv_types), -1)):
    plt.plot(n_components_range, col, label=name)
plt.legend()
plt.savefig('gmm_sklearn_bic/bic.pdf')

data_thr['preds'] = pd.Series(preds).astype("category")