def _cluster(params):
    cls = None
    method = sh.getConst('method')
    if method == 'kmedoid':
        assert False  # kmedoid support is currently disabled
        # from kmedoid import kmedoid
        # cls = kmedoid
    elif method == 'dbscan':
        from sklearn.cluster import DBSCAN
        cls = DBSCAN(eps=params['eps'], min_samples=params['min_samples'],
                     metric='precomputed')
    else:
        assert False, 'FATAL: unknown cluster method'
    ##
    mat = sh.getConst('mat')
    labels = cls.fit_predict(mat)
    nLabels = len(set(labels))
    ##
    sil = None
    cal = None
    if (nLabels >= 2) and (nLabels <= len(labels) - 1):
        sil = met.silhouette_score(mat, labels, metric='precomputed')
        # Note: calinski_harabasz_score expects feature vectors; here it is
        # applied to the precomputed distance matrix as if it were features.
        cal = met.calinski_harabasz_score(mat, labels)
    perf = dict(silhouette_score=sil, calinski_harabasz_score=cal)
    return (labels, perf)
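# Hedged usage sketch for _cluster above. `sh` is assumed to be a small
# shared-constant helper (as used by some parallel-map libraries); the stub
# below only mimics the getConst()/setConst() interface that _cluster relies
# on, and the data and DBSCAN parameters are illustrative, not from the source.
import numpy as np
import sklearn.metrics as met
from sklearn.metrics import pairwise_distances


class _ConstStore(object):
    _consts = {}

    @classmethod
    def setConst(cls, key, value):
        cls._consts[key] = value

    @classmethod
    def getConst(cls, key):
        return cls._consts[key]


sh = _ConstStore

if __name__ == '__main__':
    X = np.random.RandomState(0).rand(100, 5)
    sh.setConst('method', 'dbscan')
    sh.setConst('mat', pairwise_distances(X, metric='euclidean'))
    labels, perf = _cluster({'eps': 0.5, 'min_samples': 5})
    print(perf)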
def tryOne(label, kDict):
    data = kDict['data']
    kValues = kDict['k']
    if 'main' in kDict:
        main = kDict['main']
        try:
            main()
        except Exception:
            traceback.print_exc()
    for nc in kValues:
        print('%s[%d]:' % (label, nc))
        kmeans = KMeans(n_clusters=nc)
        try:
            kmeans.fit(data)
        except Exception:
            traceback.print_exc()
            continue
        print(kmeans.cluster_centers_)
        labels = kmeans.labels_
        # https://sklearn.org/modules/clustering.html#silhouette-coefficient
        sscore = metrics.silhouette_score(data, labels)
        print('Silhouette Coefficient: %f' % sscore)
        # https://sklearn.org/modules/clustering.html#calinski-harabaz-index
        chindex = metrics.calinski_harabasz_score(data, labels)
        print('Calinski-Harabasz Index: %f' % chindex)
def question_two(X):
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    from sklearn import metrics
    kmean = KMeans(n_clusters=4, random_state=9)
    y_pred = kmean.fit_predict(X)
    data = X.copy()
    data[u'标签'] = kmean.labels_
    for i in range(4):
        tmp = data[data[u'标签'] == i]
        print('Share of samples in this cluster:', tmp.shape[0] / data.shape[0])
        print(u'Primary diagnosis icd9 code range:', min(tmp[u'一级诊断']), max(tmp[u'一级诊断']))
        print(u'Secondary diagnosis icd9 code range:', min(tmp[u'二级诊断']), max(tmp[u'二级诊断']))
        print(u'Tertiary diagnosis icd9 code range:', min(tmp[u'三级诊断']), max(tmp[u'三级诊断']))
    # Draw a 3D scatter plot
    x = list(map(eval, list(X[u'一级诊断'])))
    y = list(map(eval, list(X[u'二级诊断'])))
    z = list(map(eval, list(X[u'三级诊断'])))
    ax = plt.subplot(111, projection='3d')  # create a 3D plotting axes
    # Color the points by predicted cluster so the groups are distinguishable
    ax.scatter(x, y, z, c=y_pred, s=0.1)  # plot the data points
    ax.set_zlabel('diag_3')  # axis labels
    ax.set_ylabel('diag_2')
    ax.set_xlabel('diag_1')
    plt.show()
    print(metrics.calinski_harabasz_score(X, y_pred))
    # Search for the best k
    result = []
    for i in range(3, 100):
        y_pred = KMeans(n_clusters=i, random_state=9).fit_predict(X)
        tmp = metrics.calinski_harabasz_score(X, y_pred)
        result.append(tmp)
        print("Calinski-Harabasz Score", tmp)
    plt.scatter(list(range(3, 100)), result, alpha=0.5)
    plt.show()
def metrics(pred, labels=None, embeddings=None):
    from sklearn import metrics
    print('Estimated number of clusters: {}'.format(len(np.unique(pred))))
    if labels is not None:
        print("Homogeneity: {:0.3f}".format(
            metrics.homogeneity_score(labels, pred)))
        print("Completeness: {:0.3f}".format(
            metrics.completeness_score(labels, pred)))
        print("V-measure: {:0.3f}".format(
            metrics.v_measure_score(labels, pred)))
        print("Adjusted Rand Index: {:0.3f}".format(
            metrics.adjusted_rand_score(labels, pred)))
        print("Adjusted Mutual Information: {:0.3f}".format(
            metrics.adjusted_mutual_info_score(labels, pred)))
    if embeddings is not None:
        print("Silhouette Coefficient: {:0.3f}".format(
            metrics.silhouette_score(embeddings, pred)))
        print("Calinski-Harabasz Index: {:0.3f}".format(
            metrics.calinski_harabasz_score(embeddings, pred)))
def cluster_range(X, clusterer, k_start, k_stop):
    """Calculate internal validation criteria for different numbers of clusters

    Parameters
    ----------
    X : array
        Design matrix with each row corresponding to a point
    clusterer : class
        Contains the random state for replicability
    k_start, k_stop : int
        First and last number of clusters to perform the internal
        validation on

    Returns
    -------
    cluster : dict
        Dictionary containing the internal validation values for each k
    """
    from IPython.display import clear_output  # used to clear the progress prints
    ys = []
    inertias = []
    chs = []
    scs = []
    iidrs = []
    for k in range(k_start, k_stop + 1):
        print('Executing k=', k)
        kmeans_X = KMeans(n_clusters=k, random_state=clusterer.random_state)
        y_predict_X = kmeans_X.fit_predict(X)
        ys.append(y_predict_X)
        inertias.append(kmeans_X.inertia_)
        iidrs.append(intra_to_inter(X, y_predict_X, euclidean, 50))
        chs.append(calinski_harabasz_score(X, y_predict_X))
        scs.append(silhouette_score(X, y_predict_X))
    clear_output()
    cluster = {}
    cluster['ys'] = ys
    cluster['inertias'] = inertias
    cluster['chs'] = chs
    cluster['iidrs'] = iidrs
    cluster['scs'] = scs
    return cluster
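# Hedged usage sketch for cluster_range above. intra_to_inter is a helper
# that is not shown in this file; the stand-in below is only a placeholder
# with assumed semantics (mean intra- over mean inter-cluster distance on
# sampled pairs) so the example runs. cluster_range also calls IPython's
# clear_output, so an IPython environment is assumed. Data and the k-range
# are illustrative.
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import calinski_harabasz_score, silhouette_score


def intra_to_inter(X, y, dist, n_samples):
    # Placeholder implementation: ratio of mean intra-cluster distance to
    # mean inter-cluster distance over randomly sampled point pairs.
    rng = np.random.default_rng(1337)
    idx = rng.integers(0, len(X), size=(n_samples, 2))
    intra, inter = [], []
    for i, j in idx:
        (intra if y[i] == y[j] else inter).append(dist(X[i], X[j]))
    return np.mean(intra) / np.mean(inter) if intra and inter else np.nan


X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=1337)
res = cluster_range(X_demo, KMeans(random_state=1337), 2, 8)
plt.plot(range(2, 9), res['chs'])
plt.xlabel('k')
plt.ylabel('Calinski-Harabasz score')
plt.show()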
def cluster_score(X, y, true_labels=None, _metric="silhouette_score", **kwds):
    """
    A score to compare clusterings.

    If you know the true labels, use fowlkes_mallows_score.
    Otherwise use silhouette_score.
    """
    # Unsupervised metrics
    if _metric == 'calinski_harabasz_score':
        return metrics.calinski_harabasz_score(X, y)
    elif _metric == 'silhouette_score':
        return metrics.silhouette_score(X, y, **kwds)
    # Supervised metrics
    elif _metric == 'adjusted_rand_score':
        return metrics.adjusted_rand_score(true_labels, y)
    elif _metric == 'fowlkes_mallows_score':
        return metrics.fowlkes_mallows_score(true_labels, y)
    else:
        raise ValueError('Unimplemented metric')
def calculoMedidas(subset, predictions):
    normalized_set = preprocessing.normalize(subset, norm='l2')
    meditions = []
    for pred in predictions:
        # Compute Calinski-Harabasz
        metric_CH = metrics.calinski_harabasz_score(normalized_set, pred[1])
        # Compute Silhouette on 50% of the population
        metric_SC = metrics.silhouette_score(normalized_set, pred[1],
                                             metric='euclidean',
                                             sample_size=floor(0.5 * len(subset)),
                                             random_state=123456)
        meditions.append((pred[0], metric_CH, metric_SC))
    return meditions
def Grid_Birch(param_grid, features):
    '''
    Grid search over BIRCH clustering parameters
    Parameters:
        param_grid: parameter grid
        features: feature matrix
    '''
    for threshold, branching_factor in zip(param_grid['threshold'],
                                           param_grid['branching_factor']):
        clf = Birch(n_clusters=4, threshold=threshold,
                    branching_factor=branching_factor)
        clf.fit(features)
        predicted = clf.predict(features)
        plot_scatter(features, predicted)
        print('threshold:', threshold, 'branching_factor:', branching_factor)
        print('metrics.calinski_harabasz_score:',
              metrics.calinski_harabasz_score(features, predicted))
def k_means_equi(vectors, prod=4, min_size=100, max_size=500):
    MIN_SIZE = min_size
    MAX_SIZE = max_size
    clusters = np.zeros(vectors.shape[0], dtype=int)
    counter = pd.Series(collections.Counter(clusters))
    last_max = np.inf
    cluster_centers = dict()
    while max(counter.values) > MAX_SIZE and last_max != max(counter.values):
        last_max = max(counter.values)
        last_n_cluster = max(clusters)
        i = counter[counter > MAX_SIZE].sort_values(ascending=False).keys()[0]
        km = KMeans(n_clusters=prod * counter[i] // MAX_SIZE, init="random")
        reduced_vectors = vectors[list(np.where(clusters == i)[0])]
        reduced_clusters = km.fit_predict(reduced_vectors)
        reduced_counter = pd.Series(collections.Counter(reduced_clusters))
        while min(reduced_counter.values) < MIN_SIZE:
            j = reduced_counter[reduced_counter < MIN_SIZE].sort_values().keys()[0]
            clusters_dist = pd.Series(
                [np.linalg.norm(km.cluster_centers_[j] - km.cluster_centers_[k])
                 for k in reduced_counter.index],
                index=[k for k in reduced_counter.index])
            clusters_dist[j] = np.inf
            k = clusters_dist.sort_values().keys()[0]
            # Merge undersized cluster j into its nearest neighbour k
            # (weighted mean of the two centers).
            km.cluster_centers_[k] = (
                reduced_counter[k] * km.cluster_centers_[k]
                + reduced_counter[j] * km.cluster_centers_[j]) / (
                    reduced_counter[k] + reduced_counter[j])
            km.cluster_centers_[j] = np.inf
            np.place(reduced_clusters, reduced_clusters == j, k)
            reduced_counter = pd.Series(collections.Counter(reduced_clusters))
        clusters[list(np.where(clusters == i)[0])] = (
            last_n_cluster + reduced_clusters + 1)
        counter = pd.Series(collections.Counter(clusters))
        for i in np.unique(reduced_clusters):
            cluster_centers[last_n_cluster + i] = km.cluster_centers_[i]
    return (clusters, cluster_centers, counter,
            calinski_harabasz_score(vectors, clusters))
def cluster(self, eps, min_samples):
    if self.X is None:
        raise ValueError
    db = DBSCAN(eps=eps, min_samples=min_samples,
                n_jobs=-1).fit(self.X, sample_weight=self.weight_array)
    # The DBSCAN algorithm views clusters as areas of high density separated
    # by areas of low density. Due to this rather generic view, clusters found
    # by DBSCAN can be any shape, as opposed to k-means which assumes that
    # clusters are convex shaped. The central component to DBSCAN is the
    # concept of core samples, which are samples that are in areas of high
    # density. A cluster is therefore a set of core samples, each close to
    # each other (measured by some distance measure), and a set of non-core
    # samples that are close to a core sample (but are not themselves core
    # samples). There are two parameters to the algorithm, min_samples and
    # eps, which define formally what we mean when we say dense. Higher
    # min_samples or lower eps indicate higher density necessary to form a
    # cluster.
    # Cite:
    # "A Density-Based Algorithm for Discovering Clusters in Large Spatial
    # Databases with Noise". Ester, M., H. P. Kriegel, J. Sander, and X. Xu,
    # In Proceedings of the 2nd International Conference on Knowledge
    # Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    self.core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    self.core_samples_mask[db.core_sample_indices_] = True
    self.labels = db.labels_
    self.n_clusters = len(set(self.labels)) - (1 if -1 in self.labels else 0)
    try:
        self.si_score = silhouette_score(self.X, self.labels)
        # The Silhouette Coefficient is calculated using the mean
        # intra-cluster distance (a) and the mean nearest-cluster distance (b)
        # for each sample. The Silhouette Coefficient for a sample is
        # (b - a) / max(a, b). To clarify, b is the distance between a sample
        # and the nearest cluster that the sample is not a part of. Note that
        # the Silhouette Coefficient is only defined if
        # 2 <= n_labels <= n_samples - 1.
        # Cite:
        # Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
        # Interpretation and Validation of Cluster Analysis".
        # Computational and Applied Mathematics 20: 53-65.
    except ValueError:
        self.si_score = -1
    try:
        self.calinski = calinski_harabasz_score(self.X, self.labels)
        # The score is defined as the ratio between the within-cluster
        # dispersion and the between-cluster dispersion.
        # Cite:
        # T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
        # analysis". Communications in Statistics
    except ValueError:
        self.calinski = 0
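# The eps/min_samples discussion in the comments above can be made concrete
# with the usual k-distance heuristic: plot each point's distance to its
# min_samples-th nearest neighbour in ascending order and read eps off the
# "elbow". A minimal standalone sketch with illustrative data (not from the
# source):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors

X_demo, _ = make_blobs(n_samples=500, centers=3, random_state=0)
min_samples = 5
nn = NearestNeighbors(n_neighbors=min_samples).fit(X_demo)
dists, _ = nn.kneighbors(X_demo)   # dists[:, -1] is the k-th NN distance
k_dist = np.sort(dists[:, -1])
plt.plot(k_dist)
plt.ylabel('distance to %d-th nearest neighbour' % min_samples)
plt.xlabel('points sorted by that distance')
plt.show()                         # choose eps near the elbow of this curve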
def cluster_spectralclustering(n_clusters):
    """
    Spectral clustering
    :param n_clusters: number of clusters
    :return:
    """
    data = get_data("../data/feature_vector_pca.csv")
    spectral = SpectralClustering(n_clusters=n_clusters, gamma=0.01)
    clusters = spectral.fit_predict(data)
    # Sweep the hyperparameters to find the best combination
    # for index, gamma in enumerate((0.01, 0.1, 1, 10)):
    #     for index2, k in enumerate((15, 20, 25, 30)):
    #         clusters = SpectralClustering(n_clusters=k, gamma=gamma).fit_predict(data)
    #         print("Calinski-Harabasz Score with gamma=", gamma, "n_clusters=", k,
    #               "score:", metrics.calinski_harabasz_score(data, clusters))
    print("Calinski-Harabasz Score", metrics.calinski_harabasz_score(data, clusters))
    print("Cluster index of each sample", clusters)
    data_labeled_to_csv(clusters, "data/data_labeld_birch.csv")
def BestClusteringGMM(objectives):
    """
    :type objectives: dataframe of objectives
    """
    X = objectives.values
    num_clusters = np.arange(3, 7)
    models = [GMM(n_components=n, covariance_type='full').fit(X)
              for n in num_clusters]
    scores = [metrics.calinski_harabasz_score(X, m.predict(X)) for m in models]
    # scores2 = [metrics.silhouette_score(X, m.predict(X)) for m in models]
    max_score = np.max(scores)
    max_index = scores.index(max_score)
    best_num_cluster = num_clusters[max_index]
    print("Best number of clusters is {} with calinski_harabasz_score={}".format(
        best_num_cluster, round(max_score, 3)))
    return best_num_cluster
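# Hedged usage sketch for BestClusteringGMM above. GMM is assumed to be an
# alias for sklearn's GaussianMixture (the snippet appears to predate the
# rename); the toy DataFrame of "objectives" is illustrative only.
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture as GMM

vals, _ = make_blobs(n_samples=200, centers=5, random_state=7)
objectives = pd.DataFrame(vals, columns=['obj1', 'obj2'])
best_k = BestClusteringGMM(objectives)  # sweeps n_components over 3..6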
def task1():
    X = pd.read_csv("pluton.csv")
    for numIterations in [1, 2]:
        kmeans = KMeans(n_clusters=3, random_state=0,
                        max_iter=numIterations).fit(X)
        colormap = plt.get_cmap('hsv')
        norm = matplotlib.colors.Normalize(vmin=0, vmax=3)
        axes = pd.plotting.scatter_matrix(X, color=colormap(norm(kmeans.labels_)))
        plt.suptitle(f'max_iter = {numIterations}')
        labels = kmeans.labels_
        print(f'max_iter = {numIterations}, n_iter = {kmeans.n_iter_}')
        print('Silhouette-Score = ',
              metrics.silhouette_score(X, labels, metric='euclidean'))
        print('Calinski-Harabasz Index = ',
              metrics.calinski_harabasz_score(X, labels))
        print('Davies-Bouldin Index',
              metrics.davies_bouldin_score(X, labels), end='\n\n')
    plt.show()
def cluster(values, max_cluster):
    algorithm = "auto"
    silhouette_scores = []
    calinski_harabasz_scores = []
    best_silhouette_avg = 0.
    cluster_centers = []
    cluster_labels = []
    for n_clusters in range(2, max_cluster, 1):
        print(n_clusters)
        kmeans = KMeans(n_clusters=n_clusters, random_state=10,
                        algorithm=algorithm, n_init=30).fit(values)
        labels_temp = kmeans.labels_
        silhouette_avg = metrics.silhouette_score(values, labels_temp)
        silhouette_scores.append(silhouette_avg)
        calinski_harabasz = metrics.calinski_harabasz_score(values, labels_temp)
        calinski_harabasz_scores.append(calinski_harabasz)
        nb_values_over = 0
        print("cluster:", n_clusters, "silhouette score:", silhouette_avg,
              "calinski_harabasz:", calinski_harabasz)
        # keep the clustering with the highest silhouette score so far
        if best_silhouette_avg < silhouette_avg:
            best_silhouette_avg = silhouette_avg
            cluster_centers = kmeans.cluster_centers_
            cluster_labels = labels_temp
        silhouette_samples_values = metrics.silhouette_samples(values, labels_temp)
        for cluster_idx in range(n_clusters):
            cluster_values = silhouette_samples_values[labels_temp == cluster_idx]
            nb_values_over = nb_values_over + len(
                cluster_values[np.where(cluster_values > silhouette_avg)])
        print("Number of values over average:", nb_values_over,
              "({:04.1f}%)".format(nb_values_over / len(values) * 100))
    return (cluster_centers.shape[0], cluster_centers, cluster_labels,
            silhouette_scores, calinski_harabasz_scores)
def cluster_range(X, clusterer, k_start, k_stop, actual=None):
    chs = []
    iidrs = []
    inertias = []
    scs = []
    ys = []
    amis = []
    ars = []
    ps = []
    for i in range(k_start, k_stop + 1):
        # Note: this aliases (does not copy) the clusterer before mutating it.
        clusterer2 = clusterer
        clusterer2.n_clusters = i
        ys.append(clusterer2.fit_predict(X))
        iidrs.append(intra_to_inter(X, ys[-1], euclidean, 50))
        chs.append(calinski_harabasz_score(X, ys[-1]))
        inertias.append(clusterer2.inertia_)
        scs.append(silhouette_score(X, ys[-1]))
    keys = ['ys', 'iidrs', 'chs', 'inertias', 'scs']
    values = [ys, iidrs, chs, inertias, scs]
    if actual is not None:
        for y in ys:
            ps.append(purity(actual, y))
            ars.append(adjusted_rand_score(actual, y))
            amis.append(adjusted_mutual_info_score(actual, y))
        keys.extend(['ps', 'ars', 'amis'])
        values.extend([ps, ars, amis])
    return dict(zip(keys, values))
def cluster(method, dis_matrix, n, Mode, ef, p):
    if method == "KMeans":
        M = "KM"
        from sklearn.cluster import KMeans
        clusters_KMeans = KMeans(n_clusters=n, init='k-means++').fit(dis_matrix)
        labels = clusters_KMeans.labels_
        output_file = open("C:/tmp2/result_cluster/" + Mode + "/" + str(p) +
                           "_out_cluster_KMeans_" + str(n) + ".txt", "w")
        for item in labels:
            output_file.write("%s\n" % item)
        output_file.close()
    elif method == "Spectral":
        M = "SP"
        from sklearn.cluster import SpectralClustering
        cl = SpectralClustering(n_clusters=n, affinity='precomputed')
        clusters_SpectralClustering = cl.fit(dis_matrix)
        labels = clusters_SpectralClustering.labels_
        output_file = open("C:/tmp2/result_cluster/" + Mode + "/" + str(p) +
                           "_out_cluster_SpectralClustering_" + str(n) + ".txt", "w")
        for item in labels:
            output_file.write("%s\n" % item)
        output_file.close()
    else:
        print("No method found")
        return -1
    from sklearn import metrics
    ef.write("%d%%\t%s\tn=%d\t%.7f\t%.7f\n" %
             (p, M, n,
              metrics.silhouette_score(dis_matrix, labels, metric='euclidean'),
              metrics.calinski_harabasz_score(dis_matrix, labels)))
    return 0
def plot_variance_ratio(adata, res_list, X='latent', out='./clustering/',
                        prefix='', rep='latent', save=True):
    """
    res_list (list of float): list of resolutions
    X (str): representation or layer to use {'latent', 'X', 'raw'}
    """
    if 'X_{}'.format(X) in adata.obsm:
        data = adata.obsm['X_{}'.format(X)]
    elif X == 'X':
        data = adata.X
    else:
        data = adata.layers[X]
    fig, ax = plt.subplots()
    for method in ['Louvain', 'Leiden']:
        keys = []
        resolution = []
        for res in res_list:
            key = prefix + '{}Res{}_{}'.format(method, res, rep)
            # include only resolutions that yield more than one cluster
            if len(adata.obs[key].cat.categories) > 1:
                keys.append(key)
                resolution.append(res)
        scores = [calinski_harabasz_score(data, adata.obs[key].values)
                  for key in keys]
        ax.plot(resolution, scores, label=method)
    ax.legend()
    ax.set_ylabel('Variance Ratio Criterion')
    ax.set_xlabel('Resolution')
    ax.set_title(X)
    if save:
        fig.savefig(os.path.join(out, f'variance_ratio_criterion_{X}.png'))
        plt.close()
    else:
        plt.show()
    return
def kmeans(kvalue):
    dataMat = []
    fr = open("E:\\code\\python\\data\\dshl_kmeans.txt")
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))  # convert every element to float
        dataMat.append(fltLine)
    fr.close()
    km = KMeans(n_clusters=int(kvalue))  # initialize
    km.fit(dataMat)  # fit
    km_preds = km.predict(dataMat).tolist()  # predict
    centers = km.cluster_centers_.tolist()  # centroids
    result_list = []
    for i, centerPoint in enumerate(centers):
        result = {'kIndex': i + 1, 'centerPoint': centerPoint, 'dataCount': 0}
        result_list.append(result)
    for km_pred in km_preds:
        result_list[km_pred]['dataCount'] = result_list[km_pred]['dataCount'] + 1
    ch_value = metrics.calinski_harabasz_score(dataMat, km_preds)
    return jsonify({'resultList': result_list, 'chValue': ch_value})
def test_clusterer_calinskiHarabaz(XY):
    X = XY[0]
    Y = XY[1]
    # "_args": [{"type": "numpy.ndarray", "dtype": "float32"},
    #           {"type": "numpy.ndarray", "dtype": "int32"}],
    # "_return": [{"type": "float"}]
    from sklearn import metrics
    print('test_clusterer_calinskiHarabaz')
    # Rescale the raw index into [0, 1] using fixed assumed bounds.
    min_score = 0
    max_score = 200
    calinski_harabaz = metrics.calinski_harabasz_score(X, Y)
    calinski_harabaz = (calinski_harabaz - min_score) / (max_score - min_score)
    return calinski_harabaz
def evaluate_clusters(X, ids, labels_file):
    print("Evaluating: " + labels_file)
    clusters, label_list = load_clusters(labels_file, ids)
    # run evaluations
    # 1. Silhouette Coefficient (averaged over repeated 1000-point samples)
    sc = 0.0
    for i in range(100):
        sc += silhouette_score(X, clusters, sample_size=1000)
    sc /= 100.0
    # 2. Variance Ratio Criterion
    vrc = calinski_harabasz_score(X, clusters)
    # 3. Davies-Bouldin Index
    dbs = davies_bouldin_score(X, clusters)
    print("Silhouette, Calinski-Harabasz, Davies-Bouldin")
    print([sc, vrc, dbs])
def _cluster_plot(self, embedding, labels):
    silhouette = silhouette_score(embedding.squeeze(), labels)
    chs = calinski_harabasz_score(embedding.squeeze(), labels)
    dbs = davies_bouldin_score(embedding.squeeze(), labels)
    n_labels = len(set(labels))
    self.writer.add_scalar(f"silhouette {n_labels}", silhouette, self.step_id)
    self.writer.add_scalar(f"chs {n_labels}", chs, self.step_id)
    self.writer.add_scalar(f"dbs {n_labels}", dbs, self.step_id)
    indices = list(range(len(labels)))
    random.shuffle(indices)
    samples_to_plot = indices[:1000]
    sample_labels = [labels[idx] for idx in samples_to_plot]
    sample_embedding = embedding[samples_to_plot]
    pca = PCA(2).fit_transform(sample_embedding.squeeze())
    fig, ax = plt.subplots()
    ax.scatter(pca[:, 0], pca[:, 1], c=sample_labels, cmap="tab20")
    self.writer.add_figure(f"clustering {n_labels}", fig, self.step_id)
def clustering_in_eval(labels, X, mode='sil'):
    """
    :param mode: sil: Silhouette Coefficient; cal: Calinski-Harabasz
    :param labels: clustering labels
    :param X: M x N document-feature matrix
    :return: the metric value, or None if there is only one cluster
    """
    if np.unique(labels).size == 1:
        print('labels: ', labels)
        return None
    value = None
    if mode == 'sil':
        value = metrics.silhouette_score(X, labels, metric='euclidean')
        print("Silhouette Coefficient: ", value)
    elif mode == 'cal':
        value = metrics.calinski_harabasz_score(X, labels)
        print("Calinski-Harabasz: ", value)
    return value
def cluster_score(X, y_pred):
    """Score the clustering result; higher is better.
    [in]  X: array-like, shape (n_samples, n_features), data matrix
          y_pred: array-like, shape (n_samples,), cluster assignments
    [out] score: double, clustering quality score
    """
    try:
        if not isinstance(X, np.ndarray):
            X = X.toarray()
        score = calinski_harabasz_score(X, y_pred)
        logging.info("Calinski-Harabasz Score : %.4f" % score)
    except Exception as e:
        score = -1.0
        logging.info("calculate score fail.")
        logging.info(e)
    return score
def clustering_centers_D():
    # precompute_distances was deprecated and later removed from KMeans,
    # so it is omitted here.
    model = KMeans(n_clusters=5, init="k-means++", n_init=228,
                   random_state=None, max_iter=300).fit(audio_array_scaled)
    labels = model.labels_  # same labels that fit_predict would return
    centers = np.array(model.cluster_centers_)
    for r in range(len(centers)):
        print({"The center of cluster " + str(r): centers[r]})
    print("The value of calinski_harabasz_score is = " +
          str(metrics.calinski_harabasz_score(audio_array_scaled, labels)))
def dbscan_clustering(data_frame: pd.core.frame.DataFrame):
    print("DBSCAN clustering")
    x = data_frame.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    normalized_data_frame = pd.DataFrame(x_scaled)
    DBSCAN_clustering = DBSCAN()
    cluster_labels = DBSCAN_clustering.fit_predict(normalized_data_frame)
    # Note: this raises if DBSCAN finds fewer than two clusters.
    calinski_harabasz_avg = calinski_harabasz_score(normalized_data_frame,
                                                    cluster_labels)
    print("The average calinski_harabasz_score is :", calinski_harabasz_avg)
    # Turn the data into two columns with PCA
    pca2D = decomposition.PCA(2)
    plot_columns = pca2D.fit_transform(normalized_data_frame)
    # Plot using a scatter plot and shade by cluster label
    plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=cluster_labels)
    plt.show()
def MyKmeans10(Im, ImageType, MaxClusters):
    if ImageType == 'Hyper':
        r, c = Im.shape[0:2]
        Im = np.reshape(Im, (r * c, Im.shape[2]))
        pca = PCA(n_components=0.95)
        data = pca.fit_transform(Im)  # pre-processing
    if ImageType == 'RGB':
        r, c = Im.shape[0:2]
        data = np.zeros((r * c, 3))
        n = -1
        for i in range(r):
            for j in range(c):
                n = n + 1
                data[n, :] = Im[i, j, :]
    metric = []
    for numclust in range(2, MaxClusters + 1):
        kmeans = KMeans(n_clusters=numclust)
        kmeans.fit(data)
        labels = kmeans.labels_
        metric.append(metrics.calinski_harabasz_score(data, labels))
    metric = np.array(metric)
    # Best cluster count: position of the (first) maximum score,
    # offset by the starting cluster count of 2.
    index = int(np.argmax(metric)) + 2
    kmeans1 = KMeans(n_clusters=index)
    kmeans1.fit(data)
    labels1 = kmeans1.labels_ + 1
    ClusterIm = np.zeros((r, c))
    n2 = -1
    for i2 in range(r):
        for j2 in range(c):
            n2 = n2 + 1
            ClusterIm[i2, j2] = labels1[n2]
    return ClusterIm
def cluster_embeddings(word_vectors, cluster_num, batch_size, init, max_iter,
                       max_no_improvement, verbose):
    """Cluster word embedding vectors and evaluate the variance ratio"""
    kmeans_handler = MiniBatchKMeans(n_clusters=cluster_num,
                                     batch_size=batch_size, init=init,
                                     max_iter=max_iter,
                                     max_no_improvement=max_no_improvement,
                                     verbose=verbose)
    # get the cluster index of each embedding
    indices = kmeans_handler.fit_predict(word_vectors.vectors)
    # create a dictionary that maps each word to its cluster number
    word_cluster_map = dict(zip(word_vectors.index2word, indices))
    # Calinski-Harabasz index: the ratio of between- to within-cluster
    # dispersion (higher means better-separated clusters)
    variance_ratio = calinski_harabasz_score(word_vectors.vectors, indices)
    # get the lists of words forming each cluster
    word_clusters = [list() for _ in range(cluster_num)]
    for word in word_cluster_map.keys():
        word_clusters[word_cluster_map[word]].append(word)
    return word_clusters, variance_ratio
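# Hedged usage sketch for cluster_embeddings above. It assumes a gensim-3-style
# KeyedVectors object (exposing .vectors and .index2word; gensim 4 renamed the
# latter to .index_to_key). The embeddings path is a placeholder, and the
# hyperparameters are illustrative.
from gensim.models import KeyedVectors
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import calinski_harabasz_score

word_vectors = KeyedVectors.load_word2vec_format('embeddings.bin', binary=True)  # placeholder path
clusters, ratio = cluster_embeddings(word_vectors, cluster_num=50,
                                     batch_size=1000, init='k-means++',
                                     max_iter=100, max_no_improvement=10,
                                     verbose=0)
print('variance ratio: %.2f' % ratio)
print('sample of first cluster:', clusters[0][:10])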
def kmeans_blocks(x, box_num=3):
    '''
    Binning via KMeans clustering
    :param x: list of numeric values
    :param box_num: number of bins
    :return: bin edges
    '''
    n_blocks = min(box_num, len(x), len(set(x)) + 1)
    if n_blocks <= 1:
        return [-np.inf, np.inf]
    X = np.array(x).reshape([-1, 1])
    km = KMeans(n_clusters=n_blocks - 1, random_state=666)
    y_pre = km.fit_predict(X)
    # Calinski-Harabasz index: higher means better-separated clusters
    if km.cluster_centers_.size > 1:  # more than one cluster
        kmeans_score = calinski_harabasz_score(X, y_pre)
        print("Clustering score: {}".format(kmeans_score))
    tb = km.cluster_centers_.reshape([-1])
    tb.sort()
    blocks = np.concatenate([[-np.inf], tb, [np.inf]])
    return blocks.tolist()
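# Hedged usage sketch for kmeans_blocks above; the sample values are
# illustrative. With box_num=3 the function fits a 2-centroid KMeans and
# returns [-inf, sorted centers..., inf] as bin edges, giving three bins.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score

values = [1, 2, 2, 3, 10, 11, 12, 50, 55]
edges = kmeans_blocks(values, box_num=3)
print(edges)  # [-inf, center_1, center_2, inf]
print(pd.Series(pd.cut(values, bins=edges)).value_counts(sort=False))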
def CHI(points, labelsPred):
    """ Calinski and Harabasz Index measure

    Parameters
    ----------
    points : ndarray
        The data points that were clustered
    labelsPred : list
        A list of predicted labels for the points for each chromosome

    Returns
    -------
    float
        fitness: the fitness value
    """
    global fitnessFunc
    fitnessFunc = "CH"
    ch = metrics.calinski_harabasz_score(points, labelsPred)
    # Invert so that a higher (better) index yields a lower fitness,
    # as expected by a minimizing optimizer.
    fitness = 1 / ch
    return fitness
def get_hdbsan_best_eps_minsamples(df_features, max_num_members):
    calinski_score_dict = {}
    for i in range(2, max_num_members):
        min_samples = i
        min_cluster_size = min_samples
        # build clustering estimator
        estimator = get_model('hdbscan', min_samples, df_features)
        estimator.fit(df_features)
        labels = estimator.labels_
        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        try:
            calinski_score = metrics.calinski_harabasz_score(df_features, labels)
        except Exception:
            calinski_score = 0
        print('min_cluster_size', min_cluster_size, 'minsample', min_samples,
              'n_cluster: ', n_clusters_, 'calinski_score: ', calinski_score)
        calinski_score_dict[str(int(min_samples))] = calinski_score
    return get_highest_score(calinski_score_dict)
def kmeans(normalizedDataFrame, df_new):
    # set kmeans for k=5
    k = 5
    kmeans = KMeans(n_clusters=k)
    # Cluster and assign every row to one of the k groups
    cluster_labels = kmeans.fit_predict(normalizedDataFrame)
    # Use the calinski_harabasz score to assess the clustering quality
    calinski_avg = metrics.calinski_harabasz_score(normalizedDataFrame,
                                                   cluster_labels)
    print("For n_clusters = ", str(k),
          " the average calinski_harabasz_score is :", calinski_avg)
    # PCA: convert our high-dimensional data to 2 dimensions
    pca2D = decomposition.PCA(2)
    # Turn the data into two columns with PCA
    plot_columns = pca2D.fit_transform(normalizedDataFrame)
    # Plot using a scatter plot and shade by cluster label
    plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=cluster_labels)
    plt.title("For kmeans method,PCA for k=" + str(k))
    plt.savefig("For kmeans method, PCA for k=" + str(k))
    plt.show()
    plt.clf()
    plt.close()
    # Get labels for each sector
    print("\n**************Kmeans**************\n")
    df_new['labels'] = cluster_labels
    dfa = pd.concat([df_new['sector'], df_new['labels']], axis=1)
    for i in range(0, 5):
        dfa0 = dfa.loc[dfa['labels'] == i, :]
        print("For cluster", i)
        print(dfa0['sector'].value_counts())
def ward(normalizedDataFrame, df_new):
    Z = linkage(normalizedDataFrame, method='ward', metric='euclidean')
    # we set k=5
    k = 5
    labels_1 = fcluster(Z, t=k, criterion='maxclust')
    # Use the calinski_harabasz score to assess the clustering quality
    calinski_avg = metrics.calinski_harabasz_score(normalizedDataFrame, labels_1)
    print("For n_clusters = ", str(k),
          " the average calinski_harabasz_score is :", calinski_avg)
    # PCA: convert our high-dimensional data to 2 dimensions
    pca2D = decomposition.PCA(2)
    # Turn the data into two columns with PCA
    plot_columns = pca2D.fit_transform(normalizedDataFrame)
    # Plot using a scatter plot and shade by cluster label
    plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=labels_1)
    plt.title("For hierarchical method,PCA for k=" + str(k))
    plt.savefig("For hierarchical method,PCA for k=" + str(k))
    plt.show()
    plt.clf()
    plt.close()
    # Get labels for each sector
    print("\n**************Hierarchical**************\n")
    df_new['labels'] = labels_1
    dfa = pd.concat([df_new['sector'], df_new['labels']], axis=1)
    for i in range(1, 6):
        dfa0 = dfa.loc[dfa['labels'] == i, :]
        print("For cluster", i)
        print(dfa0['sector'].value_counts())
def find_optimal_K(training_data, min_number_of_clusters, max_number_of_clusters):
    max_calinski_harabasz = {'num_clusters': 0, 'score': -1}
    for i in range(min_number_of_clusters, max_number_of_clusters + 1):
        cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
            training_data.T, i, 5, error=0.005, maxiter=1000, init=None, seed=1)
        # Harden the fuzzy memberships into crisp labels for scoring
        cluster_membership = np.argmax(u, axis=0)
        score = calinski_harabasz_score(training_data, cluster_membership)
        if score >= max_calinski_harabasz['score']:
            max_calinski_harabasz['num_clusters'] = i
            max_calinski_harabasz['score'] = score
    return max_calinski_harabasz['num_clusters']
def main():
    if len(sys.argv) != 4:
        print("Usage: python kmedoid.py [e|ic|gpcr|nr] [dataDir] [outputDir]")
        return
    # argv order follows the usage string above
    dataset = sys.argv[1]
    dataPath = sys.argv[2]
    outPath = sys.argv[3]
    # Load file
    print("Preparing data")
    _, comList, proList = yam.loadComProConnMat(dataset, dataPath + "/Adjacency")
    kernel = yam.loadKernel(dataset, dataPath)
    nComp = len(comList)
    nProtein = len(proList)
    comSimMat = np.zeros((nComp, nComp), dtype=float)
    proSimMat = np.zeros((nProtein, nProtein), dtype=float)
    for row, i in enumerate(comList):
        for col, j in enumerate(comList):
            comSimMat[row][col] = kernel[(i, j)]
    for row, i in enumerate(proList):
        for col, j in enumerate(proList):
            proSimMat[row][col] = kernel[(i, j)]
    # convert similarity matrices to distance matrices
    proDisMat = simToDis(proSimMat)
    comDisMat = simToDis(comSimMat)
    print("Clustering")
    proMedoid, proClust = kMedoids(len(proList) // 2, proDisMat)
    comMedoid, comClust = kMedoids(len(comList) // 2, comDisMat)
    # Take each label for each sample
    comLabelList = np.zeros(nComp)
    proLabelList = np.zeros(nProtein)
    proMetaClust = dict()
    comMetaClust = dict()
    for lab in proClust:
        meta = []
        for idx in proClust[lab]:
            meta.append(proList[idx])
            proLabelList[idx] = lab
        proMetaClust[lab] = meta
    for lab in comClust:
        meta = []
        for idx in comClust[lab]:
            meta.append(comList[idx])
            comLabelList[idx] = lab
        comMetaClust[lab] = meta
    print("Evaluation")
    comSilhouette = met.silhouette_score(comDisMat, comLabelList, metric="precomputed")
    proSilhouette = met.silhouette_score(proDisMat, proLabelList, metric="precomputed")
    # Note: calinski_harabasz_score expects feature vectors; it is applied to
    # the distance matrices here, as in the original code.
    comCalinskiHarabasz = met.calinski_harabasz_score(comDisMat, comLabelList)
    proCalinskiHarabasz = met.calinski_harabasz_score(proDisMat, proLabelList)
    print("Silhouette score :\nCompound cluster = " + str(comSilhouette) +
          ", Protein cluster = " + str(proSilhouette))
    print("Calinski-Harabasz score :\nCompound cluster = " + str(comCalinskiHarabasz) +
          ", Protein cluster = " + str(proCalinskiHarabasz))
    print("Writing output")
    perf = {'silhouette_score': {'compound': comSilhouette, 'protein': proSilhouette},
            'calinski_harabasz_score': {'compound': comCalinskiHarabasz,
                                        'protein': proCalinskiHarabasz}}
    with open(outPath + "/perf_medoid_" + dataset + ".json", 'w') as f:
        json.dump(perf, f, indent=2, sort_keys=True)
    with open(outPath + "/cluster_medoid_com_" + dataset + ".json", 'w') as f:
        json.dump(comMetaClust, f, indent=2, sort_keys=True)
    with open(outPath + "/cluster_medoid_pro_" + dataset + ".json", 'w') as f:
        json.dump(proMetaClust, f, indent=2, sort_keys=True)
def run(self):
    """ Process data """
    data = copy.copy(self.indata['Raster'])
    self.update_vars()
    no_clust = range(self.min_cluster, self.max_cluster + 1)
    self.reportback('Cluster analysis started')
    # Section to deal with different bands having different null values.
    masktmp = data[0].data.mask
    for i in data:
        masktmp += i.data.mask
    for i, _ in enumerate(data):
        data[i].data.mask = masktmp
    X = np.array([i.data.compressed() for i in data]).T
    if self.radiobutton_sscale.isChecked():
        X = skp.StandardScaler().fit_transform(X)
    elif self.radiobutton_rscale.isChecked():
        X = skp.RobustScaler().fit_transform(X)
    dat_out = []
    for i in self.pbar.iter(no_clust):
        if self.cltype != 'DBSCAN':
            self.reportback('Number of Clusters:' + str(i))
        elif i > no_clust[0]:
            continue
        if self.cltype == 'k-means':
            # cfit = skc.KMeans(n_clusters=i, tol=self.tol,
            #                   max_iter=self.max_iter).fit(X)
            cfit = skc.MiniBatchKMeans(n_clusters=i, tol=self.tol,
                                       max_iter=self.max_iter).fit(X)
        elif self.cltype == 'DBSCAN':
            cfit = skc.DBSCAN(eps=self.eps,
                              min_samples=self.min_samples).fit(X)
        elif self.cltype == 'Birch':
            cfit = skc.Birch(n_clusters=i, threshold=self.bthres,
                             branching_factor=self.branchfac).fit(X)
        dat_out.append(Clust())
        for k in data:
            dat_out[-1].input_type.append(k.dataid)
        zonal = np.ma.masked_all(data[0].data.shape)
        alpha = (data[0].data.mask == 0)
        zonal[alpha == 1] = cfit.labels_
        dat_out[-1].data = zonal
        dat_out[-1].nullvalue = zonal.fill_value
        dat_out[-1].no_clusters = i
        dat_out[-1].center = np.zeros([i, len(data)])
        dat_out[-1].center_std = np.zeros([i, len(data)])
        if cfit.labels_.max() > -1:
            dat_out[-1].vrc = skm.calinski_harabasz_score(X, cfit.labels_)
        if self.cltype == 'k-means':
            dat_out[-1].center = np.array(cfit.cluster_centers_)
    self.log = 'Cluster complete (' + self.cltype + ')'
    for i in dat_out:
        i.tlx = data[0].tlx
        i.tly = data[0].tly
        i.xdim = data[0].xdim
        i.ydim = data[0].ydim
        i.nrofbands = 1
        i.dataid = 'Clusters: ' + str(i.no_clusters)
        if self.cltype == 'DBSCAN':
            i.dataid = 'Clusters: ' + str(int(i.data.max() + 1))
        i.rows = data[0].rows
        i.cols = data[0].cols
        i.nullvalue = data[0].nullvalue
    self.reportback('Cluster complete (' + self.cltype + ')')
    for i in dat_out:
        i.data += 1
        i.data = i.data.astype(np.uint8)
    self.outdata['Cluster'] = dat_out
    self.outdata['Raster'] = self.indata['Raster']
    return True
def compute_calinski_harabaz_score(estimator, X, y=None):
    # Guard the single-cluster case, for which the score is undefined.
    if estimator.n_clusters == 1:
        return 0.0
    return calinski_harabasz_score(X, estimator.fit_predict(X))
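# Hedged usage sketch for the scorer above: because its signature is
# (estimator, X, y), it can be passed directly as GridSearchCV's `scoring`
# callable to sweep n_clusters. The data and parameter grid are illustrative,
# and note the scorer refits the estimator on each evaluation fold.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import calinski_harabasz_score
from sklearn.model_selection import GridSearchCV

X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=0)
search = GridSearchCV(KMeans(random_state=0),
                      {'n_clusters': [1, 2, 3, 4, 5, 6]},
                      scoring=compute_calinski_harabaz_score, cv=3)
search.fit(X_demo)
print(search.best_params_)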
transf_list = arguments.sc3_transf.split(",")
print('\nThere are {0} transformations given.'.format(len(transf_list)))
for ts in transf_list:
    print('- Adding transformation {0}'.format(ts))
    trg_clustering.add_dimred_calculation(
        partial(sc.transformations, components=max_pca_comp, method=ts))
trg_clustering.add_intermediate_clustering(
    partial(sc.intermediate_kmeans_clustering, k=trg_k))
trg_clustering.set_build_consensus_matrix(sc.build_consensus_matrix)
trg_clustering.set_consensus_clustering(
    partial(sc.consensus_clustering, n_components=trg_k))
trg_clustering.apply()

# --------------------------------------------------
# 4. EVALUATE CLUSTER ASSIGNMENT
# --------------------------------------------------
print('\nUnsupervised evaluation:')
accs[0, j, i] = metrics.calinski_harabasz_score(
    trg_clustering.pp_data.T, trg_clustering.cluster_labels)
accs[1, j, i] = metrics.silhouette_score(
    trg_clustering.pp_data.T, trg_clustering.cluster_labels, metric='euclidean')
accs[2, j, i] = metrics.silhouette_score(
    trg_clustering.pp_data.T, trg_clustering.cluster_labels, metric='correlation')
accs[3, j, i] = metrics.silhouette_score(
    trg_clustering.pp_data.T, trg_clustering.cluster_labels, metric='jaccard')
print(' -Calinski-Harabasz : ', accs[0, j, i])
print(' -Silhouette (euc) : ', accs[1, j, i])
print(' -Silhouette (corr): ', accs[2, j, i])
print(' -Silhouette (jacc): ', accs[3, j, i])

if trg_labels is not None:
    print('\nSupervised evaluation:')
    accs[4, j, i] = metrics.adjusted_rand_score(
        trg_labels[trg_clustering.remain_cell_inds],
        trg_clustering.cluster_labels)
    print(' -ARI: ', accs[4, j, i])
print("2nd component: ", pca.components_[1]) # In[51]: from sklearn.metrics import silhouette_samples, silhouette_score Resultk=[0]*9 ResultC=[0]*9 for k in [2,3,4,5,6,7,8,9,10]: kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10) cluster_labels = kmeans.fit_predict(reduced_data[:,:pca_comps]) #cluster_labels=kmeans.fit(reduced_data[:,:2]) #silhouette_avg = silhouette_score(reduced_data[:,:pca_comps], cluster_labels) #silhouette_avg = silhouette_score(reduced_data[:,:2], cluster_labels) #print("For n_clusters =", k, "The average silhouette_score is :", silhouette_avg) calinski_harabaz_score_avg = metrics.calinski_harabaz_score(reduced_data[:,:pca_comps], cluster_labels) #calinski_harabaz_score_avg = metrics.calinski_harabaz_score(reduced_data[:,:2], cluster_labels) print("For n_clusters =", k," the average metrics.calinski_harabaz_score is :", calinski_harabaz_score_avg) Resultk[k-2]=k ResultC[k-2]=calinski_harabaz_score_avg plt.plot(Resultk,ResultC,'r*-.') # In[56]: n_clusters = 2 kmeans = KMeans(init='k-means++', n_clusters=2, n_init=10) pca = PCA().fit(data) cluster_labels = kmeans.fit_predict(reduced_data[:,:2]) plt.figure() plt.plot(range(len(pca.explained_variance_ratio_)), np.cumsum(pca.explained_variance_ratio_))
from sklearn import metrics

km2_silc = metrics.silhouette_score(X, km2_labels, metric='euclidean')
km5_silc = metrics.silhouette_score(X, km5_labels, metric='euclidean')
print('Silhouette Coefficient for num clusters=2: ', km2_silc)
print('Silhouette Coefficient for num clusters=5: ', km5_silc)


# ## Calinski-Harabasz Index

# In[30]:


km2_chi = metrics.calinski_harabasz_score(X, km2_labels)
km5_chi = metrics.calinski_harabasz_score(X, km5_labels)
print('Calinski-Harabasz Index for num clusters=2: ', km2_chi)
print('Calinski-Harabasz Index for num clusters=5: ', km5_chi)


# # Model tuning
# ## Build and Evaluate Default Model

# In[31]:


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

X = np.array([], dtype=np.float64)
n_cls = 4
with open('anchor_ratio.txt', 'r') as fo:
    for line in fo:
        if float(line) < 1.0:  # and float(line) > 3.0:
            X = np.append(X, float(line))
X = X.reshape(-1, 1)
y_pred = KMeans(n_clusters=n_cls).fit_predict(X)
score = metrics.calinski_harabasz_score(X, y_pred)
print(y_pred.shape)
# print('score = {}'.format(score))
for i in range(0, n_cls):
    cls_index = np.where(y_pred == i)
    X_value = X[cls_index]
    X_mean = np.mean(X_value)
    print('{}: x len = {:8d}, mean = {:8f}'.format(i, len(X_value), X_mean))