def performance_score(input_values, cluster_indexes, true_indexes):
    """Compute four clustering quality scores, printing each result.

    :param input_values: feature matrix the clustering was computed on
    :param cluster_indexes: predicted cluster labels
    :param true_indexes: ground-truth labels (used only by adjusted rand)
    :return: (silhouette, calinski_harabasz, davies_bouldin, adjusted_rand);
             any score that could not be computed is returned as -999.
    """
    # `except Exception` instead of a bare `except:` — a bare clause would
    # also swallow KeyboardInterrupt / SystemExit.
    try:
        silh_score = metrics.silhouette_score(input_values, cluster_indexes)
        print(' .. Silhouette Coefficient score is {:.2f}'.format(silh_score))
        print(' ... -1: incorrect, 0: overlapping, +1: highly dense clusters.')
    except Exception:
        print(' .. Warning: could not calculate Silhouette Coefficient score.')
        silh_score = -999
    try:
        ch_score = metrics.calinski_harabasz_score(input_values,
                                                   cluster_indexes)
        print(' .. Calinski-Harabasz Index score is {:.2f}'.format(ch_score))
        print(' ... Higher the value better the clusters.')
    except Exception:
        print(' .. Warning: could not calculate Calinski-Harabasz Index score.')
        ch_score = -999
    try:
        db_score = metrics.davies_bouldin_score(input_values, cluster_indexes)
        print(' .. Davies-Bouldin Index score is {:.2f}'.format(db_score))
        print(' ... 0: Lowest possible value, good partitioning.')
    except Exception:
        # BUGFIX: original message read "Davies-Bouldin Index Index score".
        print(' .. Warning: could not calculate Davies-Bouldin Index score.')
        db_score = -999
    try:
        ars = metrics.adjusted_rand_score(true_indexes, cluster_indexes)
        print(' .. adjusted rand score is {:.2f}'.format(ars))
        print(' ... Perfect labeling is scored 1.0 Bounded range [-1, 1]')
    except Exception:
        print(' .. Warning: could not calculate adjusted rand score.')
        ars = -999
    return silh_score, ch_score, db_score, ars
def calKmeans(X):
    """Cluster X into 5 groups with k-means and print a difficulty summary.

    Relies on the module-level iterable `py` (presumably keys naming the
    scored items — TODO confirm against caller) and on matplotlib state.
    """
    kmeans = KMeans(n_clusters=5, random_state=0).fit(X)
    center = kmeans.cluster_centers_
    result = kmeans.labels_
    # Unsupervised evaluation metrics.
    # Mean silhouette coefficient.
    silhouette_s = silhouette_score(X, kmeans.labels_, metric='euclidean')
    # Calinski-Harabasz score.
    calinski_harabaz_s = calinski_harabasz_score(X, kmeans.labels_)
    print('silhouette_s: %f \n calinski_harabaz_s: %f' %
          (silhouette_s, calinski_harabaz_s))
    # print(result)
    # NOTE(review): `center` is a 2-D array; sorted() compares rows
    # element-wise and raises on ties in the first column for multi-feature
    # data — this assumes effectively 1-D centers. TODO confirm.
    temp = sorted(center)
    print(temp)
    # print(center)
    # print(temp)
    # Map each original center to its rank in the sorted order, so cluster
    # label -> difficulty rank.
    index = []
    for i in range(len(temp)):
        print(temp.index(center[i]))
        index.append(temp.index(center[i]))
    # print(index)
    # Pair each key in `py` with its cluster label, in iteration order.
    re = {}
    i = 0
    for key in py:
        re[key] = int(result[i])
        i += 1
    # print(re)
    # print(type(re))
    # Tally how many keys fall into each difficulty rank.
    difficulty = [0, 0, 0, 0, 0]
    detail = ["最简单", "较简单", "中等", "较难", "最难"]
    for key in re:
        print(key, end=" ")
        difficulty[index[re[key]]] += 1
        print(detail[index[re[key]]])
    for i in range(len(difficulty)):
        print(detail[i], ":" + str(difficulty[i]))
    # plt.axes(aspect='equal')
    # plt.pie(difficulty, labels=detail, autopct='%.0f%%')
    plt.title("各分数段频数")
def get_marks(self, data, true_labels, predicted_labels):
    """Print a panel of clustering evaluation scores.

    Five scores compare predicted labels against the ground truth; the last
    two are computed from the data alone (see readme.txt).

    :param data: feature matrix the clustering was run on
    :param true_labels: ground-truth class labels
    :param predicted_labels: labels predicted by the model
    """
    print(30 * '*', "model performance", 30 * '*')
    # Supervised scores: each takes (true, predicted) label vectors.
    supervised = (
        ("Homogeneity Score (均一性): ", metrics.homogeneity_score),
        ("Completeness Score (完整性): ", metrics.completeness_score),
        ("V-Measure Score (V量): ", metrics.v_measure_score),
        ("Adjusted Rand Score (调整后兰德指数): ", metrics.adjusted_rand_score),
        ("Adjusted Mutual Info Score(调整后的共同信息): ",
         metrics.adjusted_mutual_info_score),
    )
    for caption, scorer in supervised:
        print(caption, scorer(true_labels, predicted_labels))
    # Unsupervised scores: computed from the data and predicted labels only.
    print("Calinski Harabasz Score: (方差比指数) ",
          metrics.calinski_harabasz_score(data, predicted_labels))
    print("Silhouette Score (轮廓分数): ",
          metrics.silhouette_score(data, predicted_labels))
def compute_kmeans_scores(X, n):
    """Fit KMeans for every k in 2..n and collect evaluation metrics.

    Uses the module-level `km_arguements` dict for extra KMeans options.

    :param X: feature matrix
    :param n: largest cluster count to try (inclusive)
    :return: (distortions, davies_bouldin_scores, silhouette_scores,
              calinski_harabasz_scores, times) — one entry per k.
    """
    davies_bouldin_scores = []
    distortions = []
    silhouette_scores = []
    calinski_harabasz_scores = []
    times = []
    for k in range(2, n + 1):
        t0 = time.time()
        km = KMeans(n_clusters=k, **km_arguements)
        km.fit(X)
        elapsed = time.time() - t0
        times.append(elapsed)
        distortions.append(km.inertia_)
        davies_bouldin_scores.append(davies_bouldin_score(X, km.labels_))
        sil = silhouette_score(X, km.labels_)
        silhouette_scores.append(sil)
        calinski_harabasz_scores.append(calinski_harabasz_score(X, km.labels_))
        print(
            "For n_clusters = {} average silhouette_score: {} time taken: {}s".
            format(k, sil, elapsed))
    return distortions, davies_bouldin_scores, silhouette_scores, calinski_harabasz_scores, times
def clusteringAffinityPropagation(self, X, verbose):
    """Grid-search AffinityPropagation's `preference`, fit, optionally plot.

    :param X: 2-D sample array
    :param verbose: when true, print the cluster count and draw the clusters
    :return: (labels, number of clusters)
    """
    pref = [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10]
    best_pref = 0
    best_sil = 0
    for i in pref:
        model = AffinityPropagation(preference=i, max_iter=500).fit(X)
        cluster_centers_indices = model.cluster_centers_indices_
        labels = model.labels_
        # The score is only defined for 2 .. n-1 clusters.
        if len(set(labels)) <= 1 or len(set(labels)) > len(X) - 1:
            continue
        sil = calinski_harabasz_score(X, labels)
        # sil = silhouette_score(X, labels, metric='sqeuclidean')
        if sil > best_sil:
            best_sil = sil
            best_pref = i
    # BUGFIX: the original refit with default parameters here, silently
    # discarding the search above. Use the winning preference; fall back to
    # the defaults only when no candidate produced a scoreable clustering.
    if best_pref != 0:
        model = AffinityPropagation(preference=best_pref, max_iter=500).fit(X)
    else:
        model = AffinityPropagation().fit(X)
    cluster_centers_indices = model.cluster_centers_indices_
    labels = model.labels_
    n_clusters_ = len(cluster_centers_indices)
    if verbose:
        print('Estimated number of clusters: %d' % n_clusters_)
        colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        for k, col in zip(range(n_clusters_), colors):
            class_members = labels == k
            cluster_center = X[cluster_centers_indices[k]]
            plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
            plt.plot(cluster_center[0], cluster_center[1], 'o',
                     markerfacecolor=col, markeredgecolor='k', markersize=14)
            for x in X[class_members]:
                plt.plot([cluster_center[0], x[0]],
                         [cluster_center[1], x[1]], col)
        plt.title('Estimated number of clusters: %d' % n_clusters_)
        plt.show()
    return model.labels_, n_clusters_
def create_histograms(path, views_folder):
    """Build per-level cluster-size histograms and print view-quality scores.

    For every level directory under `path`, loads the saved cluster labels,
    prints silhouette / Calinski-Harabasz / Davies-Bouldin scores for each
    view of that level, and records the label-count histogram.

    :param path: directory whose immediate sub-directories are level folders
    :param views_folder: folder handed to load_views for each level
    :return: dict mapping level -> [count_dict, summary string]
    """
    histograms = {}
    for level in next(os.walk(path))[1]:
        labels = read_results(os.path.join(path, level, 'labels.npy'))
        # graph = read_results(os.path.join(path, level, 'graph.npy'))
        views = load_views(views_folder, level)
        print(level)
        for view in views[level]:
            view_data = views[level][view]
            print('Silhouette', view,
                  round(
                      metrics.silhouette_score(view_data, labels,
                                               metric='euclidean'), 4))
            print('Calinski-Harabasz', view,
                  round(metrics.calinski_harabasz_score(view_data, labels), 4))
            print('Davies-Bouldin', view,
                  round(metrics.davies_bouldin_score(view_data, labels), 4))
        # Histogram of cluster sizes for this level.
        counts = {}
        for lab in labels:
            counts[lab] = counts.get(lab, 0) + 1
        # `default=` keeps the original ±inf behavior for empty label sets.
        big = max(counts.values(), default=float('-inf'))
        small = min(counts.values(), default=float('inf'))
        histograms[level] = [
            counts,
            level + ', n: ' + str(len(labels)) + ', k: ' + str(len(counts)) +
            ', Biggest: ' + str(big) + ', Smallest: ' + str(small)
        ]
    return histograms
def spectral_explore_neighbors(features_vector, max_neighbors=30):
    """Pick a good n_neighbors for SpectralClustering (n_clusters fixed at 2).

    Evaluates silhouette (cosine) and Calinski-Harabasz for every candidate
    neighbor count, plots both curves, and returns the rounded mean of the
    positions where each metric's first derivative peaks.

    :param features_vector: 2-D feature array
    :param max_neighbors: largest neighbor count to try (capped at half the
        sample count when it exceeds the number of samples)
    :return: chosen neighbor count (int)
    """
    scores = []
    # Keep n_neighbors below the number of samples.
    if max_neighbors > features_vector.shape[0]:
        max_neighbors = features_vector.shape[0] // 2
    candidates = range(2, max_neighbors + 1)
    for n_nb in tqdm(candidates):
        # n_clusters is fixed at 2; only the neighborhood size varies.
        model = SpectralClustering(n_clusters=2,
                                   eigen_solver='arpack',
                                   affinity="nearest_neighbors",
                                   assign_labels="discretize",
                                   random_state=0,
                                   n_neighbors=n_nb,
                                   n_jobs=-1).fit((features_vector))
        labels = model.labels_
        # silhouette: best 1 / worst -1; calinski: the higher the better.
        scores.append([
            silhouette_score(features_vector, labels, metric='cosine'),
            calinski_harabasz_score(features_vector, labels)
        ])
    df = pd.DataFrame(scores, index=candidates,
                      columns=['silhouette', 'calinski'])
    df.plot(y=['silhouette', 'calinski'],
            subplots=True,
            sharex=True,
            figsize=(10, 12),
            fontsize=14,
            linewidth=2)
    # Mean of the indices where the metrics' first derivatives are maximal.
    best_neighbors = int(round((df.diff().idxmax().mean())))
    print("Best number of neighbors is {:d}".format(best_neighbors))
    return best_neighbors
def main():
    """Demo: cluster a synthetic 'circles' dataset and print three scores."""
    # Silence library warnings for a clean demo output.
    warnings.filterwarnings("ignore")
    # Other synthetic datasets that can be swapped in:
    #   samples_generator.make_blobs(n_samples=200, centers=2, cluster_std=1, random_state=0)
    #   samples_generator.make_moons(n_samples=200, noise=0.05, random_state=0)
    x, y = samples_generator.make_circles(n_samples=200,
                                          noise=0.05,
                                          random_state=0,
                                          factor=0.4)
    # Seven clusterers were tried here (KMeans, MeanShift, Agglomerative,
    # AffinityPropagation, SpectralClustering, DBSCAN, GaussianMixture);
    # the Gaussian mixture is the active one.
    clusters = GaussianMixture(n_components=2)
    _x = clusters.fit_predict(x)
    # Three internal evaluation metrics:
    print(metrics.silhouette_score(x, _x))         # 1. silhouette coefficient
    print(metrics.calinski_harabasz_score(x, _x))  # 2. Calinski-Harabasz index
    print(metrics.davies_bouldin_score(x, _x))     # 3. Davies-Bouldin index
    # Visualize the labeling.
    plt.scatter(x[:, 0], x[:, 1], c=_x, cmap='viridis')
    plt.show()
def HierarchicalClustering(n_clusters, links, affs, clust_col, X, **kwargs):
    """Run AgglomerativeClustering for every (linkage, affinity) pair.

    :param n_clusters: number of clusters to form
    :param links: linkage strategies to try
    :param affs: affinities (distance metrics) to try
    :param clust_col: score columns; when it has 3 entries, NMI is also
        computed (using X.index.values as the true labels)
    :param X: DataFrame of features (its index is recorded as 'True')
    :return: (df_scores indexed by (link, aff), df_clust of per-sample labels)
    """
    df_scores = pd.DataFrame(columns=clust_col,
                             index=pd.MultiIndex.from_product(
                                 [links, affs], names=['links', 'affs']))
    df_clust = pd.DataFrame()
    for aff in affs:
        for link in links:
            # 'ward' linkage is only defined for euclidean distances.
            if link == 'ward' and aff != 'euclidean':
                continue
            print("\t\tlink: {}, aff: {}".format(link, aff))
            fit = AgglomerativeClustering(n_clusters=n_clusters,
                                          affinity=aff,
                                          linkage=link)
            labels = fit.fit_predict(X)
            # BUGFIX/modernization: DataFrame.append was deprecated and
            # removed in pandas 2.0 — use pd.concat instead.
            df_clust = pd.concat([
                df_clust,
                pd.DataFrame({
                    'True': X.index.values,
                    'Predicted': labels,
                    "n_clusters": n_clusters,
                    "link-aff": link + "_" + aff
                })
            ])
            # Record the internal validity scores for this combination.
            df_scores.loc[(link, aff),
                          "CHS"] = calinski_harabasz_score(X, labels)
            df_scores.loc[(link, aff), "SS"] = silhouette_score(X, labels)
            # With a third column requested, also score against the index.
            if len(clust_col) == 3:
                df_scores.loc[(link, aff),
                              "NMI"] = normalized_mutual_info_score(
                                  X.index.values, labels,
                                  average_method='geometric')
    return df_scores, df_clust
def clustering(self, ax, K_range, K_offset=0):
    """Choose a cluster count by maximal Calinski-Harabasz score.

    Fits Ward-linkage agglomerative clustering on the standardized latent
    state columns for every K in K_range, plots the CH curve on `ax` with
    the optimum highlighted, and stores the winning labels in
    self.data['cluster_raw'] / self.data['cluster'].

    :param ax: matplotlib axes for the CH-vs-K scatter
    :param K_range: iterable of candidate cluster counts
    :param K_offset: shifts the selected index away from the raw argmax
    """
    XT = StandardScaler().fit_transform(
        self.data[self.columns_latent_states].values)
    K_range = np.array(K_range)

    def fit_labels(K):
        return AgglomerativeClustering(
            n_clusters=K,
            linkage='ward',
        ).fit_predict(XT)

    CH = np.fromiter(
        (calinski_harabasz_score(XT, fit_labels(K)) for K in K_range),
        dtype=float)
    K_opt = K_range[CH.argmax() + K_offset]
    # Highlight the chosen K in a different color.
    ax.scatter(K_range, CH, marker='x',
               color=np.where(K_range == K_opt, 'C1', 'C0'))
    print(f'K_opt = {K_opt}')
    y = fit_labels(K_opt)
    print(f'#clusters = {len(set(y) - {-1})}, #-1 = {(y == -1).sum()}')
    self.data['cluster_raw'] = y
    self.data['cluster'] = list(map(str, y))
def bench_EM(estimator, labels, name, data, sample_size, n_clusters,
             random_state, filename, verbose=False):
    """Fit a mixture-model estimator and compute a battery of scores.

    Homogeneity, completeness, Fowlkes-Mallows and AIC are currently
    disabled (hard-coded to 0), matching the original benchmark table shape.

    :param estimator: model with fit/predict/bic/score (e.g. GaussianMixture)
    :param labels: ground-truth labels
    :param name: row caption for the verbose table
    :param data: feature matrix
    :param sample_size: subsample size for the silhouette score
    :param n_clusters: number of components (reported only)
    :param random_state: RNG seed for the silhouette subsample
    :param filename: unused here; kept for interface compatibility
    :param verbose: print one formatted result row when true
    :return: (fit_time, homogeneity, comp, v_meas, ari, ami, fks, silo,
              dbs, chs, aics, bics, scor)
    """
    t0 = time()
    estimator.fit(data)
    fit_time = time() - t0
    # BUGFIX: the original declared `h**o = 0`, which is not a valid Python
    # identifier (a censored/corrupted name) — a SyntaxError. Renamed.
    # Also predict once and reuse: the original recomputed
    # estimator.predict(data) for every single metric.
    pred = estimator.predict(data)
    homogeneity = 0  # metrics.homogeneity_score(labels, pred)
    comp = 0  # metrics.completeness_score(labels, pred)
    v_meas = metrics.v_measure_score(labels, pred)
    ari = metrics.adjusted_rand_score(labels, pred)
    ami = metrics.adjusted_mutual_info_score(labels, pred)
    fks = 0  # metrics.fowlkes_mallows_score(labels, pred)
    silo = metrics.silhouette_score(data, pred,
                                    metric='euclidean',
                                    sample_size=sample_size,
                                    random_state=random_state)
    dbs = metrics.davies_bouldin_score(data, pred)
    chs = metrics.calinski_harabasz_score(data, pred)
    aics = 0  # estimator.aic(data) — Akaike information criterion; lower is better
    # Bayesian information criterion for the current model on X; lower is better.
    bics = estimator.bic(data)
    scor = estimator.score(data)
    if verbose:
        print('%-9s\t%d\t%.2fs\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%i\t%i\t%i\t%.3f'
              % (name, n_clusters, fit_time, homogeneity, comp, v_meas, ari,
                 ami, fks, silo, dbs, chs, aics, bics, scor))
    return (fit_time, homogeneity, comp, v_meas, ari, ami, fks, silo, dbs,
            chs, aics, bics, scor)
def measurescore(test_data, y_true, y_pred):
    """Collect five clustering scores for the given prediction.

    :param test_data: feature matrix the clustering was run on
    :param y_true: ground-truth labels
    :param y_pred: predicted labels (any shape; flattened before scoring)
    :return: [calinski_harabasz, davies_bouldin, adjusted_mutual_info,
              adjusted_rand, v_measure]
    """
    print(y_pred.shape)
    y_pred = y_pred.reshape(-1, )
    # Internal metrics (no ground truth needed):
    # Calinski-Harabasz — larger is better.
    score_ch = metrics.calinski_harabasz_score(test_data, y_pred)
    # (silhouette_score was considered here but is disabled)
    # Davies-Bouldin — smaller is better.
    score_db = metrics.davies_bouldin_score(test_data, y_pred)
    # External metrics (compared against y_true):
    score_mi = metrics.adjusted_mutual_info_score(y_true, y_pred)  # [0,1], larger better
    score_adi = metrics.adjusted_rand_score(y_true, y_pred)        # [-1,1], larger better
    score_vm = metrics.v_measure_score(y_true, y_pred)             # [0,1], larger better
    return [score_ch, score_db, score_mi, score_adi, score_vm]
def hierarchical(myData):
    """Ward agglomerative clustering with k=3; print scores and plot via PCA."""
    print('**hierarchical start**')
    # Fixed cluster count for this experiment.
    k = 3
    hier = AgglomerativeClustering(n_clusters=k,
                                   affinity='euclidean',
                                   linkage='ward')
    cluster_labels = hier.fit_predict(myData)
    # Internal validity scores for the resulting partition.
    silhouette_avg = metrics.silhouette_score(myData, cluster_labels)
    calinski_avg = metrics.calinski_harabasz_score(myData, cluster_labels)
    print("For n_clusters =", k, "The average silhouette_score is :",
          silhouette_avg)
    print("For n_clusters =", k, "The average calinski_harabaz_score is :",
          calinski_avg)
    # Merge tree of the hierarchy (retrieved for inspection only).
    childrens = hier.children_
    # print(childrens)
    plotPCA(myData, cluster_labels, '_hierachical')
    print('**hierarchical end**')
    print()
def evaluation_Score(features, y_pred, output_df, model):
    """Write silhouette / calinski / davies scores into output_df row `model`.

    When the labeling is degenerate (one cluster total, or one cluster per
    sample) the three metrics are undefined, so -1 is stored instead.
    Any exception is printed and swallowed; output_df is always returned.

    :param features: feature matrix
    :param y_pred: predicted cluster labels
    :param output_df: DataFrame with columns 'silhouette'/'calinski'/'davies'
    :param model: row label to write into
    :return: the (mutated) output_df
    """
    try:
        n_unique = len(set(y_pred))
        if n_unique in (1, len(y_pred)):
            # Degenerate labeling — scores undefined, use sentinel -1.
            output_df.loc[model, 'silhouette'] = -1
            output_df.loc[model, 'calinski'] = -1
            output_df.loc[model, 'davies'] = -1
        else:
            output_df.loc[model, 'silhouette'] = metrics.silhouette_score(
                features, y_pred)
            output_df.loc[model, 'calinski'] = metrics.calinski_harabasz_score(
                features, y_pred)
            output_df.loc[model, 'davies'] = metrics.davies_bouldin_score(
                features, y_pred)
    except Exception as e:
        print(e)
    return output_df
def plot_tsne(ax, curnpz, name, cluster_path='./data_clean/results/'):
    """Draw a 2-D t-SNE embedding of saved conv features, colored by label.

    :param ax: matplotlib axes to draw on
    :param curnpz: unused here; kept for interface compatibility
    :param name: result-folder name under cluster_path
    :param cluster_path: root directory holding per-name npz files
    :return: the axes, with scatter + colorbar + direction arrows drawn
    """
    # Load the archive once (the original opened the same file twice).
    archive = np.load("{}{}/features_conv_v27.npz".format(cluster_path, name))
    labels = archive['labels'][:, 1]
    conv = archive['feature_vector']
    # NOTE(review): `cal` is computed but never used — kept for parity.
    cal = calinski_harabasz_score(conv, labels)
    # BUGFIX: sklearn's TSNE takes `random_state`, not `random_seed` —
    # the original raised TypeError at call time.
    X_embedded = TSNE(n_components=2,
                      perplexity=15,
                      learning_rate=10,
                      random_state=0).fit_transform(conv)
    classes = np.unique(labels)
    cmap = plt.cm.get_cmap('brg', len(classes))
    im = ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=labels, cmap=cmap)
    ax.figure.colorbar(im, ax=ax, ticks=classes)
    title = "Cluster audio, T-SNE representation"
    ax.set(title=title)
    # Arrow spacing proportional to the sample count.
    space = int(len(conv) * 0.1)
    add_direction(ax, X_embedded[:, 0], X_embedded[:, 1], space, 5)
    return ax
def cal_plot(self):
    """Run spectral clustering and plot the three classes in two projections.

    Uses self.data (feature array), self.N (sample count), self.K and
    self.gamma (SpectralClustering parameters); expects exactly 3 clusters.
    """
    y_pred = SpectralClustering(n_clusters=self.K,
                                gamma=self.gamma).fit_predict(self.data)
    # Partition the first N rows by predicted label.
    class1, class2, class3 = (
        np.array([self.data[i] for i in range(self.N) if y_pred[i] == lab])
        for lab in (0, 1, 2))
    # Projection onto feature dimensions 2 and 3.
    plt.plot(class1[:, 2], class1[:, 3], 'co', label="class1")
    plt.plot(class2[:, 2], class2[:, 3], 'yo', label="class2")
    plt.plot(class3[:, 2], class3[:, 3], 'go', label="class3")
    plt.legend(loc="best")
    plt.title("Spectral Clustering dim 2/3")
    plt.show()
    # Projection onto feature dimensions 0 and 1.
    plt.plot(class1[:, 0], class1[:, 1], 'co', label="class1")
    plt.plot(class2[:, 0], class2[:, 1], 'yo', label="class2")
    plt.plot(class3[:, 0], class3[:, 1], 'go', label="class3")
    plt.legend(loc="best")
    plt.title("Spectral Clustering dim 0/1")
    plt.show()
    print("class1", " ", len(class1))
    print("class2", " ", len(class2))
    print("class3", " ", len(class3))
    # print(y_pred)
    print("Calinski-Harabasz Score",
          metrics.calinski_harabasz_score(self.data, y_pred))
    print("silhouette_scores", metrics.silhouette_score(self.data, y_pred))
def assessment(self):
    """Plot inertia, silhouette and Calinski-Harabasz curves for k = 3..8."""
    ks = range(3, 9)
    inertias = []
    silhouette = []
    calinski_harabasz = []
    for k in ks:
        km = KMeans(n_clusters=k, copy_x=True).fit(self.data)
        inertias.append(km.inertia_)
        silhouette.append(silhouette_score(self.data, km.labels_))
        calinski_harabasz.append(
            calinski_harabasz_score(self.data, km.labels_))
    # One elbow-style figure per metric.
    for ylabel, series in (("inertia", inertias),
                           ("silhouette_score", silhouette),
                           ("calinski_harabasz_score", calinski_harabasz)):
        plt.xlabel("k")
        plt.ylabel(ylabel)
        plt.plot(ks, series, "o-")
        plt.show()
def test_gmm_cluster():
    """Pick n_components for a GMM via internal metrics, then plot via t-SNE."""
    X, y = make_blobs(n_features=13,
                      n_samples=1000,
                      centers=4,
                      cluster_std=[2, 5, 3, 5],
                      random_state=100)
    X = StandardScaler().fit_transform(X)
    # Score candidate component counts with two internal metrics:
    # silhouette in (-1, 1), closer to 1 is better; Calinski-Harabasz,
    # larger is better. Silhouette tends to be the more reliable of the
    # two but is costlier to compute. (KMeans was tried here as well.)
    for k in (2, 3, 4, 5):
        gmm = GaussianMixture(n_components=k, random_state=100)
        y_pred = gmm.fit_predict(X)
        score_si = metrics.silhouette_score(X, y_pred)
        score_ch = metrics.calinski_harabasz_score(X, y_pred)
        print(k, score_si, score_ch)
    # Refit with the chosen count and visualize a 2-D embedding
    # (PCA and per-class marker plotting were alternatives here).
    gmm = GaussianMixture(n_components=4, random_state=100)
    y_pred = gmm.fit_predict(X)
    data = pd.DataFrame(X)
    data['pre_class'] = y_pred
    de = TSNE(n_components=2, random_state=0).fit_transform(X)
    plt.scatter(de[:, 0], de[:, 1], c=y_pred)
    plt.show()
def best_kmeans(x, k_nums, centers):
    """Fit one KMeans per candidate k and return the best model + its scores.

    The best model maximizes scaled(silhouette) + scaled(calinski)
    - scaled(davies), skipping single-cluster solutions.

    :param x: feature matrix
    :param k_nums: sequence of candidate cluster counts
    :param centers: per-candidate initial centers, parallel to k_nums
    :return: (best fitted KMeans, its measure row)

    Fixes over the original:
    - each `measure` row is now appended to `all_measures` (it was built
      and silently dropped, so downstream indexing operated on [])
    - the metric functions now receive `kmeans.labels_` (the original
      passed the fitted KMeans object itself, a TypeError)
    - the loop iterates positions, not values, of k_nums (the original
      indexed k_nums/centers with their own elements)
    - the scoring loop no longer shadows the parameter `x`
    """
    all_measures = []
    all_kmeans = []
    for idx in range(len(k_nums)):
        kmeans = KMeans(n_clusters=k_nums[idx], init=centers[idx]).fit(x)
        all_kmeans.append(kmeans)
        labels = kmeans.labels_
        measure = [
            silhouette_score(x, labels),
            calinski_harabasz_score(x, labels),
            davies_bouldin_score(x, labels),
            len(set(labels)),
            0,
        ]
        all_measures.append(measure)
    # NOTE(review): the unpack order (sil, dav, cal) follows the original
    # call; confirm it matches what give_graph_arrays actually returns,
    # since the measure rows above store (sil, cal, dav).
    sil, dav, cal, num_clusters, man_f = give_graph_arrays(all_measures)
    sil = scale(sil)
    dav = scale(dav)
    cal = scale(cal)
    best_index = 0
    best_score = sil[0] + cal[0] - dav[0]
    for i in range(len(sil)):
        current_calc = sil[i] + cal[i] - dav[i]
        # A single cluster is never an acceptable answer.
        if (num_clusters[i] == 1):
            continue
        if (current_calc > best_score):
            best_index = i
            best_score = current_calc
    return (all_kmeans[best_index], all_measures[best_index])
def clusteringAgglomerativeClustering(self, X, n_c, verbose):
    """Agglomerative clustering; when n_c == 0, search 1..10 clusters by CH.

    :param X: 2-D sample array
    :param n_c: fixed cluster count, or 0 to search for the best one
    :param verbose: when true, print the count and scatter-plot the clusters
    :return: (labels, number of clusters)
    """
    best_n_cluster = 0
    best_sil = 0
    if n_c == 0:
        for candidate in range(1, 11):
            fitted = AgglomerativeClustering(n_clusters=candidate).fit(X)
            cand_labels = fitted.labels_
            # The CH score needs at least two distinct clusters.
            if len(set(cand_labels)) <= 1:
                continue
            score = calinski_harabasz_score(X, cand_labels)
            # score = silhouette_score(X, cand_labels, metric='sqeuclidean')
            if score > best_sil:
                best_sil = score
                best_n_cluster = candidate
        model = AgglomerativeClustering(n_clusters=best_n_cluster).fit(X)
        labels = model.labels_
        n_clusters_ = model.n_clusters_
        if verbose:
            print('Estimated number of clusters: %d' % n_clusters_)
    else:
        model = AgglomerativeClustering(n_clusters=n_c).fit(X)
        labels = model.labels_
        n_clusters_ = model.n_clusters_
    if verbose:
        colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        for k, col in zip(range(n_clusters_), colors):
            class_members = labels == k
            plt.scatter(X[class_members, 0], X[class_members, 1], s=200, c=col)
        plt.title('Estimated number of clusters: %d' % n_clusters_)
        plt.show()
    return model.labels_, n_clusters_
def visualise_calinski_harabasz(subfolder, dataset, results):
    """Bar-plot the Calinski-Harabasz score of every clustering in `results`.

    Bars whose result name starts with "KUB0462" are drawn red, others blue.
    The figure is saved under ../../images/<subfolder>/<dataset>/.

    :param subfolder: sub-directory of the images root
    :param dataset: dataset name (also used in the plot title)
    :param results: list of dicts with 'name' and 'clusters' keys
    """
    indexes = list(range(len(results)))
    names = []
    scores = []
    colors = []
    image_path = f"../../images/{subfolder}/{dataset}"
    if not os.path.exists(image_path):
        os.makedirs(image_path)
    for r in results:
        name = r['name']
        names.append(name)
        colors.append('r' if name.startswith("KUB0462") else 'b')
        items, labels = labelize_clusters(r["clusters"])
        score = metrics.calinski_harabasz_score(items, labels)
        scores.append(score)
        print(f'calinski: {name} - {score}')
    fig, ax = plt.subplots()
    width, height = get_size(results)
    fig.set_size_inches(width, height)
    ax.set_xticks(indexes)
    ax.set_xticklabels(names)
    ax.bar(indexes, scores, color=colors)
    plt.margins(0.01)
    plt.title(f"{dataset.upper()}: Calinski Harabasz Score")
    plt.savefig(image_path + "/calinski_harabasz_score.png")
def calculate_extrinsic_metrics(dataset, real_classes, predicted_classes):
    """Compute extrinsic (and a few intrinsic) clustering metrics.

    :param dataset: feature matrix (for silhouette / CH / DB)
    :param real_classes: ground-truth labels
    :param predicted_classes: predicted labels
    :return: dict of named scores; 'media' is the mean of eight of them.

    Improvement over the original: each supervised metric was computed
    twice (once for its own key, once again inside 'media') — now each is
    computed exactly once and reused.
    """
    confusion_matrix = matriz_confusion(real_classes, predicted_classes)
    pureza = medida_pureza(confusion_matrix)
    f1 = medida_f1(confusion_matrix)
    mutual_info = metrics.mutual_info_score(real_classes, predicted_classes)
    ari = metrics.adjusted_rand_score(real_classes, predicted_classes)
    homogeneity = metrics.homogeneity_score(real_classes, predicted_classes)
    completeness = metrics.completeness_score(real_classes, predicted_classes)
    v_measure = metrics.v_measure_score(real_classes, predicted_classes)
    fowlkes_mallows = metrics.fowlkes_mallows_score(real_classes,
                                                    predicted_classes)
    return {
        'Error': medida_error(confusion_matrix),
        'Pureza': pureza,
        'F1': f1,
        'Entropía': medida_entropia(confusion_matrix),
        'Información mútua': mutual_info,
        'ARI': ari,
        'Homogeneidad': homogeneity,
        'Completación': completeness,
        'Medida V': v_measure,
        'Fowlkes-Mallows': fowlkes_mallows,
        'Silhouette': metrics.silhouette_score(dataset, predicted_classes,
                                               metric='euclidean'),
        'Calinski-Harabasz': metrics.calinski_harabasz_score(
            dataset, predicted_classes),
        'Davies-Bouldin': davies_bouldin_score(dataset, predicted_classes),
        # Mean of the eight supervised scores above.
        'media': (pureza + f1 + mutual_info + ari + homogeneity +
                  completeness + v_measure + fowlkes_mallows) / 8
    }
def get_marks(estimator, data, name=None):
    """Fit the estimator on `data` and print seven evaluation scores.

    Five of the scores compare the module-level ground-truth `labels`
    against estimator.labels_; the last two need only the data
    (see readme.txt).

    :param estimator: clustering model exposing fit() and labels_
    :param name: caption printed in the header
    :param data: feature data set
    """
    estimator.fit(data)
    print(20 * '*', name, 20 * '*')
    # Supervised scores: (ground truth, predicted) label pairs.
    supervised = (
        ("Homogeneity Score: ", metrics.homogeneity_score),
        ("Completeness Score: ", metrics.completeness_score),
        ("V Measure Score: ", metrics.v_measure_score),
        ("Adjusted Rand Score: ", metrics.adjusted_rand_score),
        ("Adjusted Mutual Info Score: ", metrics.adjusted_mutual_info_score),
    )
    for caption, scorer in supervised:
        print(caption, scorer(labels, estimator.labels_))
    # Unsupervised scores: need only data and predicted labels.
    print("Calinski Harabasz Score: ",
          metrics.calinski_harabasz_score(data, estimator.labels_))
    print("Silhouette Score: ",
          metrics.silhouette_score(data, estimator.labels_))
def internalValidation(data, clusters):
    """Compute three internal (ground-truth-free) cluster validity scores.

    - silhouette_score: bounded in [-1, 1]; values near zero indicate
      overlapping clusters, higher values dense and well-separated ones.
    - calinski_harabaz_score: higher for dense, well-separated clusters,
      and fast to compute.
    - davies_bouldin_score: zero is the lowest (best) possible value;
      tends to be higher for convex clusters than for density-based ones
      such as those produced by DBSCAN.

    :param data: feature matrix
    :param clusters: predicted cluster labels
    :return: dict mapping score name -> value
    """
    return {
        'silhouette_score':
            metrics.silhouette_score(data, clusters, metric='euclidean'),
        'calinski_harabaz_score':
            metrics.calinski_harabasz_score(data, clusters),
        'davies_bouldin_score':
            metrics.davies_bouldin_score(data, clusters),
    }
def train_model():
    """Fit KMeans over the positive entries of G['dist'] for k = 1..max_clusters
    and store the selected model and its minimum-mean cluster label in the
    module-level dict G.
    """
    kmms = []
    # Positive distances only, as a single-feature column vector.
    X = G['dist'][G['dist'] > 0].reshape(-1, 1)
    for i in range(1, max_clusters + 1):
        km = KMeans(n_clusters=i)
        yhat = km.fit_predict(X)
        try:
            score = calinski_harabasz_score(X, yhat)
        except ValueError:
            # k=1 (or another degenerate fit) has no CH score; inf keeps it
            # out of the min() selection below.
            score = np.inf
        # Mean distance within each cluster, and the extremes.
        label_means = [(label, X[yhat == label].mean()) for label in set(yhat)]
        min_mean_label, min_mean = min(label_means, key=lambda x: x[1])
        max_mean_label, max_mean = max(label_means, key=lambda x: x[1])
        kmms.append((i, score, min_mean_label, min_mean, max_mean_label,
                     max_mean, km))
    # NOTE(review): this selects the model with the SMALLEST Calinski-Harabasz
    # score; CH is conventionally higher-is-better — confirm min() is intended.
    k, scr, G['min_dist_label'], _, _, max_mean, G['kmm'] = min(
        kmms, key=lambda x: x[1])
    # Mean of the single-cluster (k=1) solution, used as a baseline.
    k1mean = kmms[0][3]
    dist_factor = abs(k1mean - max_mean) / max_mean
    # When the spread is small, fall back to the k=1 model.
    if dist_factor < CONFIG["ft_dist_factor"]:
        k, scr, G['min_dist_label'], _, _, _, G['kmm'] = kmms[0]
    logging.debug('k=%s, score=%s, min_dist_label=%s, dist_factor=%s', k, scr,
                  G['min_dist_label'], dist_factor)
def clustererEvaluationMetric(clusterer_container):
    """Populate clusterer_container['performance'] with evaluation metrics.

    Which metrics are computed is driven by the characters of
    `metricstring`: 'r' adjusted rand, 'm' adjusted mutual info,
    's' silhouette, 'c' Calinski-Harabasz. Scores that cannot be computed
    (fewer than 2 clusters, or a metric error) are recorded as -9999.

    :param clusterer_container: dict unpacked by unPackClustererContainer
    :return: the same container, with 'performance' entries filled in
    """
    # print('S3 clustererEvaluationMetric>>')
    import numpy as np
    from sklearn import metrics
    clusterer, X, SX, y, size, clusters_num, metricstring = \
        unPackClustererContainer(clusterer_container)
    # Fraction of samples per cluster; index -1 covers DBSCAN-style noise.
    cluster_elements = [(clusterer.labels_ == i).sum()
                        for i in range(-1, clusters_num)]
    cluster_elements_percent = np.array(cluster_elements) / len(X)
    clusterer_container['performance'][
        'cluster_elements_percent'] = cluster_elements_percent
    # (removed unused local `choicenmetrics = {}` from the original)
    if 'r' in metricstring:
        m = metrics.adjusted_rand_score(y, clusterer.labels_)
        clusterer_container['performance']['a-randscore'] = m
    if 'm' in metricstring:
        m = metrics.adjusted_mutual_info_score(y, clusterer.labels_)
        clusterer_container['performance']['a-mutualinfo'] = m
    if 's' in metricstring:
        sample_size = len(X)
        # Silhouette is undefined with fewer than 2 clusters.
        try:
            m = -9999 if clusters_num <= 1 else metrics.silhouette_score(
                X, clusterer.labels_, metric='euclidean',
                sample_size=sample_size)
        except Exception:
            # BUGFIX: was a bare `except:` — don't swallow SystemExit etc.
            m = -9999
        clusterer_container['performance']['silhouette'] = m
    if 'c' in metricstring:
        # Calinski-Harabasz is also undefined with fewer than 2 clusters.
        try:
            m = -9999 if clusters_num <= 1 else \
                metrics.calinski_harabasz_score(X, clusterer.labels_)
        except Exception:
            m = -9999
        clusterer_container['performance']['calinski'] = m
    # print('S3 done.<<<')
    return clusterer_container
def kmeans(self):
    """Run k-means from the GUI settings and display the chosen validity score.

    Reads the cluster count and metric from the UI widgets, normalizes the
    data for the cosine metric, then shows silhouette / Davies-Bouldin /
    Calinski-Harabasz depending on the selected radio button. Errors are
    surfaced through showDialog.
    """
    try:
        if self.dane.empty:
            self.showDialog('Dane nie zostały wczytane')
            return
        daneKMeans = self.dane.copy()
        daneKMeans = daneKMeans.astype(float64)
        # BUGFIX: the original tested `currentText() in 'cosine'`, a reversed
        # substring check — '' or 'cos' would also match. Compare equality.
        if self.ui.comboBoxKMeansMetryka.currentText() == 'cosine':
            daneKMeans = preprocessing.normalize(daneKMeans, norm='l2')
        KM = kmeans(daneKMeans,
                    n_clusters=self.ui.spinBoxKMeansIloscKlastrow.value(),
                    metric=self.ui.comboBoxKMeansMetryka.currentText(),
                    maxiter=1000,
                    verbose=0)
        labelsKM = KM[1]
        if self.ui.radioButtonMiaraSilhouette.isChecked():
            score_silhouette_KM = silhouette_score(
                self.dane, labelsKM,
                metric=self.ui.comboBoxKMeansMetryka.currentText())
            score_silhouette_KM = round(score_silhouette_KM, 4)
            self.ui.txtWynikSprawdzenia.setText(str(score_silhouette_KM))
        elif self.ui.radioButtonMiaraDaviesBoudlin.isChecked():
            score_DaviesBoudlin_KM = davies_bouldin_score(self.dane, labelsKM)
            score_DaviesBoudlin_KM = round(score_DaviesBoudlin_KM, 4)
            self.ui.txtWynikSprawdzenia.setText(str(score_DaviesBoudlin_KM))
        elif self.ui.radioButtonMiaraCelinskiHarabasz.isChecked():
            score_CalinskiHarabasz_KM = calinski_harabasz_score(
                self.dane, labelsKM)
            score_CalinskiHarabasz_KM = round(score_CalinskiHarabasz_KM, 4)
            self.ui.txtWynikSprawdzenia.setText(
                str(score_CalinskiHarabasz_KM))
    except Exception as inst:
        self.showDialog(inst)
def get_marks(estimator, data, name=None):
    """Fit the estimator and print seven clustering evaluation scores.

    Five of the scores compare the module-level ground-truth `labels`
    against estimator.labels_; the last two are computed from the data
    alone (see readme.txt).

    :param estimator: clustering model exposing fit() and labels_
    :param name: caption printed in the header
    :param data: feature data set (cast to float64 before fitting)
    """
    estimator.fit(data.astype(np.float64))
    print(30 * '*', name, 30 * '*')
    print(" 模型及参数: ", estimator)
    # Supervised scores: (ground truth, predicted) label pairs.
    supervised = (
        ("Homogeneity Score (均一性): ", metrics.homogeneity_score),
        ("Completeness Score (完整性): ", metrics.completeness_score),
        ("V-Measure Score (V量): ", metrics.v_measure_score),
        ("Adjusted Rand Score (调整后兰德指数): ", metrics.adjusted_rand_score),
        ("Adjusted Mutual Info Score(调整后的共同信息): ",
         metrics.adjusted_mutual_info_score),
    )
    for caption, scorer in supervised:
        print(caption, scorer(labels, estimator.labels_))
    # Unsupervised scores: need only data and predicted labels.
    print("Calinski Harabasz Score: (方差比指数) ",
          metrics.calinski_harabasz_score(data, estimator.labels_))
    print("Silhouette Score (轮廓分数): ",
          metrics.silhouette_score(data, estimator.labels_))
def clustering(dm, eps, path):
    """DBSCAN over a precomputed distance matrix; write grouped CSV output.

    :param dm: precomputed (square) distance matrix
    :param eps: DBSCAN neighborhood radius
    :param path: project root containing the temporary/ CSV files
    :return: (grouped DataFrame, n_clusters, davies_bouldin, calinski, silhouette)
    """
    db = DBSCAN(eps=eps, min_samples=1, metric="precomputed").fit(dm)
    labels = db.labels_
    # With min_samples=1 no point is labeled -1, but keep the guard anyway.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)
    data_df = pandas.read_csv(
        path + "/temporary/list_to_csv_with_corner_points.csv", sep=";")
    data_df["cluster"] = labels
    # Best-effort scoring: undefined for a single cluster, so fall back to 1.
    try:
        dbs = davies_bouldin_score(dm, labels)
        chs = metrics.calinski_harabasz_score(dm, labels)
        silhoutte = metrics.silhouette_score(dm, labels, metric='precomputed')
        print("DBscore: ", dbs)
        print("calsinski: ", chs)
        print("silhoutte: ", silhoutte)
    except Exception:
        # BUGFIX: was a bare `except:` — keep the deliberate fallback but
        # stop swallowing SystemExit/KeyboardInterrupt.
        dbs = 1
        chs = 1
        silhoutte = 1
    data_df["ausrichtung"] = 1
    # One row per (cluster, ausrichtung), elements joined with commas.
    data_df = data_df.groupby(['cluster', 'ausrichtung'
                               ])['element'].apply(','.join).reset_index()
    data_df.to_csv(
        path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv",
        sep=";", header=False, index=False)
    return data_df, n_clusters_, dbs, chs, silhoutte
def main(args): data_dir = os.path.join(args.root, "data") # get data dir data_path = os.path.join(data_dir, args.inFile) metrics_path = os.path.join(args.root, "metrics", args.outFileName) print('load data from' + data_path) df = pd.read_csv(data_path) pred = df['pred'].tolist() target = df['target'].tolist() eval_result = '*' * 8 + 'Evaluation' + '*' * 10 print('Evaluation:') print(f'Data Size:{df.size}') cal = calinski_harabasz_score(target, pred) eval_result += '\n' + f'calinski_harabasz:{cal}' report = silhouette_score(target, pred, metric='euclidean') eval_result += '\n' + f'silhouette_score:{report}' print(eval_result) if args.txt_report == 'true': txt_file = open(metrics_path + '.txt', 'w') txt_file.write(eval_result) txt_file.close()