def performance_score(input_values, cluster_indexes, true_indexes):
    """Print and return four clustering scores; -999 marks a failed metric."""
    try:
        silh_score = metrics.silhouette_score(input_values, cluster_indexes)
        print(' .. Silhouette Coefficient score is {:.2f}'.format(silh_score))
        print(' ... -1: incorrect, 0: overlapping, +1: highly dense clusters.')
    except Exception:
        print(' .. Warning: could not calculate Silhouette Coefficient score.')
        silh_score = -999

    try:
        ch_score = metrics.calinski_harabasz_score(input_values,
                                                   cluster_indexes)
        print(' .. Calinski-Harabasz Index score is {:.2f}'.format(ch_score))
        print(' ... The higher the value, the better the clusters.')
    except Exception:
        print(
            ' .. Warning: could not calculate Calinski-Harabasz Index score.')
        ch_score = -999

    try:
        db_score = metrics.davies_bouldin_score(input_values, cluster_indexes)
        print(' .. Davies-Bouldin Index score is {:.2f}'.format(db_score))
        print(' ... 0 is the lowest possible value; closer to 0 means better partitioning.')
    except Exception:
        print(' .. Warning: could not calculate Davies-Bouldin Index score.')
        db_score = -999

    try:
        ars = metrics.adjusted_rand_score(true_indexes, cluster_indexes)
        print(' .. Adjusted Rand score is {:.2f}'.format(ars))
        print(' ... Perfect labeling scores 1.0; bounded range [-1, 1].')
    except Exception:
        print(' .. Warning: could not calculate Adjusted Rand score.')
        ars = -999

    return silh_score, ch_score, db_score, ars
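A minimal usage sketch (not from the original): the synthetic blobs and k=3 are assumptions chosen purely for illustration, and `metrics` refers to `sklearn.metrics` as the function above expects.

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, y_true = make_blobs(n_samples=300, centers=3, random_state=0)
y_pred = KMeans(n_clusters=3, random_state=0).fit_predict(X)  # illustrative clustering
silh, ch, db, ars = performance_score(X, y_pred, y_true)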
Example #2
def calKmeans(X):
    kmeans = KMeans(n_clusters=5, random_state=0).fit(X)
    center = kmeans.cluster_centers_
    result = kmeans.labels_
    # unsupervised evaluation metrics
    # mean silhouette coefficient
    silhouette_s = silhouette_score(X, kmeans.labels_, metric='euclidean')
    # Calinski-Harabasz score
    calinski_harabaz_s = calinski_harabasz_score(X, kmeans.labels_)
    print('silhouette_s: %f \n calinski_harabaz_s: %f' % (silhouette_s, calinski_harabaz_s))

    # rank each cluster by its center value (X is assumed one-dimensional):
    # index[i] is the rank of cluster i among the sorted centers
    index = list(np.argsort(np.argsort(center[:, 0])))
    # print(index)
    re = {}
    i = 0
    for key in py:  # `py` is assumed to be a module-level dict of items, defined elsewhere
        re[key] = int(result[i])
        i += 1
    difficulty = [0, 0, 0, 0, 0]
    detail = ["easiest", "fairly easy", "medium", "fairly hard", "hardest"]
    for key in re:
        print(key, end=" ")
        difficulty[index[re[key]]] += 1
        print(detail[index[re[key]]])
    for i in range(len(difficulty)):
        print(detail[i], ":" + str(difficulty[i]))
    # plt.axes(aspect='equal')
    # plt.pie(difficulty, labels=detail, autopct='%.0f%%')
    plt.title("Frequency of each score band")
 def get_marks(self, data, true_labels, predicted_labels):
     """Compute the scores; five of them require the dataset's true labels, see readme.txt

     :data: the data to analyze
     :true_labels: the ground-truth class labels
     :predicted_labels: the labels predicted by the model
     """
     print(30 * '*', "model performance", 30 * '*')
     print("Homogeneity Score: ",
           metrics.homogeneity_score(true_labels, predicted_labels))
     print("Completeness Score: ",
           metrics.completeness_score(true_labels, predicted_labels))
     print("V-Measure Score: ",
           metrics.v_measure_score(true_labels, predicted_labels))
     print("Adjusted Rand Score: ",
           metrics.adjusted_rand_score(true_labels, predicted_labels))
     print("Adjusted Mutual Info Score: ",
           metrics.adjusted_mutual_info_score(true_labels, predicted_labels))
     print("Calinski Harabasz Score: ",
           metrics.calinski_harabasz_score(data, predicted_labels))
     print("Silhouette Score: ",
           metrics.silhouette_score(data, predicted_labels))
def compute_kmeans_scores(X, n):
    davies_bouldin_scores = []
    distortions = []
    silhouette_scores = []
    calinski_harabasz_scores = []
    times = []

    for i in range(2, n + 1):
        start_time = time.time()
        # `km_arguements` is assumed to be a module-level dict of extra
        # KMeans keyword arguments (see the sketch below)
        km = KMeans(n_clusters=i, **km_arguements)
        km.fit(X)
        time_taken = time.time() - start_time
        times.append(time_taken)
        distortions.append(km.inertia_)
        davies_bouldin_scores.append(davies_bouldin_score(X, km.labels_))
        silhouette_avg = silhouette_score(X, km.labels_)
        silhouette_scores.append(silhouette_avg)
        calinski_harabasz_scores.append(calinski_harabasz_score(X, km.labels_))
        print(
            "For n_clusters = {} average silhouette_score: {} time taken: {}s".
            format(i, silhouette_avg, time_taken))
    return distortions, davies_bouldin_scores, silhouette_scores, calinski_harabasz_scores, times
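compute_kmeans_scores reads the module-level dict `km_arguements`; the definition below is an assumption for illustration, as is the random demo data.

import numpy as np

km_arguements = {'init': 'k-means++', 'n_init': 10, 'random_state': 0}  # assumed contents
X_demo = np.random.RandomState(0).rand(200, 4)
distortions, db_s, sil_s, ch_s, times = compute_kmeans_scores(X_demo, n=8)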
Example #5
  def clusteringAffinityPropagation(self, X, verbose):
    pref = [-1,-2,-3,-4,-5,-6,-7,-8,-9,-10]
    best_pref = 0
    best_sil = 0
    for i in pref:
      model = AffinityPropagation(preference=i, max_iter=500).fit(X)
      cluster_centers_indices = model.cluster_centers_indices_
      labels = model.labels_

      if len(set(labels)) <= 1 or len(set(labels)) > len(X)-1: continue
      sil = calinski_harabasz_score(X, labels)
      # sil = silhouette_score(X, labels, metric='sqeuclidean')
      if sil > best_sil:
        best_sil = sil
        best_pref = i

    # refit using the best preference found; fall back to the default when none qualified
    if best_pref:
      model = AffinityPropagation(preference=best_pref, max_iter=500).fit(X)
    else:
      model = AffinityPropagation().fit(X)
    cluster_centers_indices = model.cluster_centers_indices_
    labels = model.labels_

    n_clusters_ = len(cluster_centers_indices)
    if verbose:
      print('Estimated number of clusters: %d' % n_clusters_)
    
      colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
      for k, col in zip(range(n_clusters_), colors):
          class_members = labels == k
          cluster_center = X[cluster_centers_indices[k]]
          plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
          plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                  markeredgecolor='k', markersize=14)
          for x in X[class_members]:
              plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

      plt.title('Estimated number of clusters: %d' % n_clusters_)
      plt.show()

    return model.labels_, n_clusters_
def create_histograms(path, views_folder):
    histograms = {}
    for level in next(os.walk(path))[1]:
        labels = read_results(os.path.join(path, level, 'labels.npy'))
        # graph = read_results(os.path.join(path, level, 'graph.npy'))
        views = load_views(views_folder, level)
        print(level)
        for view in views[level]:
            print(
                'Silhouette', view,
                round(
                    metrics.silhouette_score(views[level][view],
                                             labels,
                                             metric='euclidean'), 4))
            print(
                'Calinski-Harabasz', view,
                round(
                    metrics.calinski_harabasz_score(views[level][view],
                                                    labels), 4))
            print(
                'Davies-Bouldin', view,
                round(metrics.davies_bouldin_score(views[level][view], labels),
                      4))
        # count label frequencies and find the biggest / smallest cluster
        d = {}
        for i in labels:
            d[i] = d.get(i, 0) + 1
        big, small = max(d.values()), min(d.values())
        histograms[level] = [
            d, f'{level}, n: {len(labels)}, k: {len(d)}, Biggest: {big}, Smallest: {small}'
        ]
    return histograms
Example #7
def spectral_explore_neighbors(features_vector, max_neighbors=30):
    scores = []
    # cap the neighbor count for small datasets
    if max_neighbors > features_vector.shape[0]:
        max_neighbors = features_vector.shape[0] // 2

    number = range(2, max_neighbors + 1)
    for i in tqdm(number):
        # fix n_clusters = 2 and evaluate with silhouette and Calinski-Harabasz
        spectral = SpectralClustering(n_clusters=2,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors",
                                      assign_labels="discretize",
                                      random_state=0,
                                      n_neighbors=i,
                                      n_jobs=-1).fit(features_vector)
        labels = spectral.labels_
        scores.append([
            silhouette_score(
                features_vector, labels,
                metric='cosine'),  # silhouette: best 1, worst -1
            calinski_harabasz_score(features_vector, labels)
        ])  # calinski: the higher the better

    df = pd.DataFrame(scores,
                      index=number,
                      columns=['silhouette', 'calinski'])
    df.plot(y=['silhouette', 'calinski'],
            subplots=True,
            sharex=True,
            figsize=(10, 12),
            fontsize=14,
            linewidth=2)

    # best_neighbors: the mean of the indices where each metric's first derivative is largest
    best_neighbors = int(round((df.diff().idxmax().mean())))
    print("Best number of neighbors is {:d}".format(best_neighbors))
    return best_neighbors
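An illustrative call under assumed inputs; the random matrix below stands in for a real feature vector.

import numpy as np

features = np.random.RandomState(0).rand(120, 8)
best_n = spectral_explore_neighbors(features, max_neighbors=20)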
Example #8
def main():
    # suppress warnings
    warnings.filterwarnings("ignore")

    # generate "blob" data
    # x, y = samples_generator.make_blobs(n_samples=200, centers=2, cluster_std=1, random_state=0)
    # generate "moon" data
    # x, y = samples_generator.make_moons(n_samples=200, noise=0.05, random_state=0)
    # generate "circle" data
    x, y = samples_generator.make_circles(n_samples=200, noise=0.05, random_state=0, factor=0.4)

    """
    Seven clustering algorithms
    """
    # clusters = cluster.KMeans(2)  # K-means++
    # clusters = cluster.MeanShift()  # mean shift
    # clusters = cluster.AgglomerativeClustering(2)  # hierarchical clustering
    # clusters = cluster.AffinityPropagation()  # affinity propagation
    # clusters = cluster.SpectralClustering(n_clusters=2, affinity="nearest_neighbors")  # spectral clustering
    # clusters = cluster.DBSCAN(eps=0.55, min_samples=5)  # density-based clustering
    clusters = GaussianMixture(n_components=2)  # Gaussian mixture

    # fit and predict
    _x = clusters.fit_predict(x)

    """
    Three evaluation metrics
    """
    # 1. silhouette coefficient
    print(metrics.silhouette_score(x, _x))
    # 2. Calinski-Harabasz index
    print(metrics.calinski_harabasz_score(x, _x))
    # 3. Davies-Bouldin index
    print(metrics.davies_bouldin_score(x, _x))

    # plot
    plt.scatter(x[:, 0], x[:, 1], c=_x, cmap='viridis')
    plt.show()
def HierarchicalClustering(n_clusters, links, affs, clust_col, X, **kwargs):
    # initialize df_scores and df_clust
    df_scores = pd.DataFrame(columns=clust_col,
                             index=pd.MultiIndex.from_product(
                                 [links, affs], names=['links', 'affs']))
    df_clust = pd.DataFrame()
    # go through the possible links and affs
    for aff in affs:
        for link in links:
            # ward can only be used with euclidean
            if link == 'ward' and aff != 'euclidean':
                continue
            print("\t\tlink: {}, aff: {}".format(link, aff))
            # do the clustering
            fit = AgglomerativeClustering(n_clusters=n_clusters,
                                          affinity=aff,
                                          linkage=link)
            # get the predicted labels
            labels = fit.fit_predict(X)
            # collect this run's labelling (pd.concat, since DataFrame.append was removed from pandas)
            df_clust = pd.concat([
                df_clust,
                pd.DataFrame({
                    'True': X.index.values,
                    'Predicted': labels,
                    "n_clusters": n_clusters,
                    "link-aff": link + "_" + aff
                })
            ])
            # save the CHS and SS scores
            df_scores.loc[(link, aff),
                          "CHS"] = calinski_harabasz_score(X, labels)
            df_scores.loc[(link, aff), "SS"] = silhouette_score(X, labels)
            # if class_column is specified, save NMI-score
            if len(clust_col) == 3:
                df_scores.loc[(link, aff),
                              "NMI"] = normalized_mutual_info_score(
                                  X.index.values,
                                  labels,
                                  average_method='geometric')
    return df_scores, df_clust
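A hedged example call for the wrapper above; the label-bearing index, column list, and random data are assumptions chosen so the NMI branch runs.

import numpy as np
import pandas as pd

X_demo = pd.DataFrame(np.random.RandomState(0).rand(60, 5),
                      index=np.repeat(['a', 'b', 'c'], 20))  # index doubles as the 'True' label
links = ['ward', 'complete', 'average']
affs = ['euclidean', 'manhattan']
scores, clusts = HierarchicalClustering(3, links, affs, ['CHS', 'SS', 'NMI'], X_demo)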
Example #10
    def clustering(self, ax, K_range, K_offset=0):
        XT = self.data[self.columns_latent_states].values
        XT = StandardScaler().fit_transform(XT)
        K_range = np.array(K_range)

        f = lambda K: AgglomerativeClustering(
            n_clusters=K,
            linkage='ward',
        ).fit_predict(XT)
        CH = np.fromiter((calinski_harabasz_score(XT, f(K)) for K in K_range),
                         dtype=float)
        K_opt = K_range[CH.argmax() + K_offset]

        ax.scatter(K_range,
                   CH,
                   marker='x',
                   color=np.where(K_range == K_opt, 'C1', 'C0'))

        print(f'K_opt = {K_opt}')
        y = f(K_opt)
        print(f'#clusters = {len(set(y) - {-1})}, #-1 = {(y == -1).sum()}')
        self.data['cluster_raw'] = y
        self.data['cluster'] = list(map(str, y))
Example #11
def bench_EM(estimator, labels, name, data, sample_size, n_clusters, random_state, filename, verbose=False):
    t0 = time()
    estimator.fit(data)
    fit_time = time() - t0

    preds = estimator.predict(data)  # predict once and reuse below
    homo = 0  # metrics.homogeneity_score(labels, preds)
    comp = 0  # metrics.completeness_score(labels, preds)
    v_meas = metrics.v_measure_score(labels, preds)
    ari = metrics.adjusted_rand_score(labels, preds)
    ami = metrics.adjusted_mutual_info_score(labels, preds)
    fks = 0  # metrics.fowlkes_mallows_score(labels, preds)
    silo = metrics.silhouette_score(data, preds, metric='euclidean', sample_size=sample_size,
                                    random_state=random_state)
    dbs = metrics.davies_bouldin_score(data, preds)
    chs = metrics.calinski_harabasz_score(data, preds)
    aics = 0  # estimator.aic(data)  # Akaike information criterion; lower is better
    bics = estimator.bic(data)  # Bayesian information criterion; lower is better
    scor = estimator.score(data)
    if verbose:
        print('%-9s\t%d\t%.2fs\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%i\t%i\t%i\t%.3f'
              % (name, n_clusters, fit_time, homo, comp, v_meas, ari, ami, fks, silo, dbs, chs, aics, bics, scor)
              )
    return fit_time, homo, comp, v_meas, ari, ami, fks, silo, dbs, chs, aics, bics, scor
Example #12
def measurescore(test_data, y_true, y_pred):
    print(y_pred.shape)
    y_pred = y_pred.reshape(-1, )
    # without ground-truth labels:
    # 1. Calinski-Harabasz score: the higher the better
    score_ch = metrics.calinski_harabasz_score(test_data, y_pred)

    # 2. silhouette coefficient: in [-1, 1], the higher the better
    # score_sc = metrics.silhouette_score(test_data, y_pred)
    # Davies-Bouldin index (DBI): the lower the better
    score_db = metrics.davies_bouldin_score(test_data, y_pred)

    # with ground-truth labels:
    # 1. adjusted mutual information: in [0, 1], the higher the better
    score_mi = metrics.adjusted_mutual_info_score(y_true, y_pred)
    # adjusted Rand index: in [-1, 1], the higher the better
    score_adi = metrics.adjusted_rand_score(y_true, y_pred)
    # V-measure (homogeneity + completeness): in [0, 1], the higher the better
    score_vm = metrics.v_measure_score(y_true, y_pred)

    result_score = [score_ch, score_db, score_mi, score_adi, score_vm]

    return result_score
def hierarchical(myData):
    print('**hierarchical start**')
    # Create clusters
    k = 3
    hier = AgglomerativeClustering(n_clusters=k,
                                   affinity='euclidean',
                                   linkage='ward')
    cluster_labels = hier.fit_predict(myData)
    # Determine if the clustering is good
    silhouette_avg = metrics.silhouette_score(myData, cluster_labels)
    calinski_avg = metrics.calinski_harabasz_score(myData, cluster_labels)
    print("For n_clusters =", k, "The average silhouette_score is :",
          silhouette_avg)
    print("For n_clusters =", k, "The average calinski_harabaz_score is :",
          calinski_avg)

    childrens = hier.children_
    # print(childrens)

    plotPCA(myData, cluster_labels, '_hierachical')

    print('**hierarchical end**')
    print()
def evaluation_Score(features, y_pred, output_df, model):
    try:

        num_labels = len(set(y_pred))
        total_samples = len(y_pred)
        if (num_labels == 1 or num_labels == total_samples):
            output_df.loc[model, 'silhouette'] = -1
            output_df.loc[model, 'calinski'] = -1
            output_df.loc[model, 'davies'] = -1

        else:
            output_df.loc[model, 'silhouette'] = metrics.silhouette_score(
                features, y_pred)
            output_df.loc[model, 'calinski'] = metrics.calinski_harabasz_score(
                features, y_pred)
            output_df.loc[model, 'davies'] = metrics.davies_bouldin_score(
                features, y_pred)

    except Exception as e:
        print(e)

    return output_df
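A sketch of the DataFrame plumbing evaluation_Score expects; the model name, random data, and column layout here are assumptions.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

features = np.random.RandomState(0).rand(100, 3)
y_pred = KMeans(n_clusters=4, random_state=0).fit_predict(features)
output_df = pd.DataFrame(columns=['silhouette', 'calinski', 'davies'])
output_df = evaluation_Score(features, y_pred, output_df, model='kmeans_k4')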
Example #15
def plot_tsne(ax, curnpz, name, cluster_path='./data_clean/results/'):
    labels = np.load("{}{}/features_conv_v27.npz".format(cluster_path,
                                                         name))['labels'][:, 1]
    conv = np.load("{}{}/features_conv_v27.npz".format(cluster_path,
                                                       name))['feature_vector']

    # Calinski-Harabasz score of the saved clustering, printed for reference
    cal = calinski_harabasz_score(conv, labels)
    print('Calinski-Harabasz:', cal)

    X_embedded = TSNE(n_components=2,
                      perplexity=15,
                      learning_rate=10,
                      random_state=0).fit_transform(conv)

    classes = np.unique(labels)
    cmap = plt.cm.get_cmap('brg', len(classes))
    im = ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=labels, cmap=cmap)
    ax.figure.colorbar(im, ax=ax, ticks=classes)
    title = "Cluster audio, T-SNE representation"
    ax.set(title=title)

    space = int(len(conv) * 0.1)
    add_direction(ax, X_embedded[:, 0], X_embedded[:, 1], space, 5)
    return ax
 def cal_plot(self):
     y_pred = SpectralClustering(n_clusters=self.K, gamma=self.gamma).fit_predict(self.data)
     class1 = np.array([self.data[i] for i in range(self.N) if y_pred[i] == 0])
     class2 = np.array([self.data[i] for i in range(self.N) if y_pred[i] == 1])
     class3 = np.array([self.data[i] for i in range(self.N) if y_pred[i] == 2])
     plt.plot(class1[:, 2], class1[:, 3], 'co', label="class1")
     plt.plot(class2[:, 2], class2[:, 3], 'yo', label="class2")
     plt.plot(class3[:, 2], class3[:, 3], 'go', label="class3")
     plt.legend(loc="best")
     plt.title("Spectral Clustering dim 2/3")
     plt.show()
     plt.plot(class1[:, 0], class1[:, 1], 'co', label="class1")
     plt.plot(class2[:, 0], class2[:, 1], 'yo', label="class2")
     plt.plot(class3[:, 0], class3[:, 1], 'go', label="class3")
     plt.legend(loc="best")
     plt.title("Spectral Clustering dim 0/1")
     plt.show()
     print("class1", " ", len(class1))
     print("class2", " ", len(class2))
     print("class3", " ", len(class3))
     # print(y_pred)
     print("Calinski-Harabasz Score", metrics.calinski_harabasz_score(self.data, y_pred))
     print("silhouette_scores", metrics.silhouette_score(self.data, y_pred))
Example #17
 def assessment(self):
     inertias = []
     silhouette = []
     calinski_harabasz = []
     for i in range(3, 9):
         kmeans = KMeans(n_clusters=i, copy_x=True).fit(self.data)
         inertias.append(kmeans.inertia_)
         silhouette.append(silhouette_score(self.data, kmeans.labels_))
         calinski_harabasz.append(
             calinski_harabasz_score(self.data, kmeans.labels_))
     plt.xlabel("k")
     plt.ylabel("inertia")
     X = range(3, 9)
     plt.plot(X, inertias, "o-")
     plt.show()
     plt.xlabel("k")
     plt.ylabel("silhouette_score")
     plt.plot(X, silhouette, "o-")
     plt.show()
     plt.xlabel("k")
     plt.ylabel("calinski_harabasz_score")
     plt.plot(X, calinski_harabasz, "o-")
     plt.show()
Example #18
def test_gmm_cluster():
    X, y = make_blobs(n_features=13,
                      n_samples=1000,
                      centers=4,
                      cluster_std=[2, 5, 3, 5],
                      random_state=100)
    X = StandardScaler().fit_transform(X)
    # multi-dimensional data: choose the hyperparameter n_clusters via clustering metrics
    for k in (2, 3, 4, 5):
        # cluster = KMeans(n_clusters=k, max_iter=500, random_state=100)
        cluster = GaussianMixture(n_components=k, random_state=100)
        y_pred = cluster.fit_predict(X)
        # silhouette coefficient: in (-1, 1), the closer to 1 the better
        score_si = metrics.silhouette_score(X, y_pred)
        # Calinski-Harabasz score: higher is better. The silhouette coefficient is
        # generally a little more reliable, but it is more expensive to compute.
        score_ch = metrics.calinski_harabasz_score(X, y_pred)
        print(k, score_si, score_ch)

    # reduce to 2-D and plot to inspect the clustering quality
    # cluster = KMeans(n_clusters=4, max_iter=500, random_state=100)
    cluster = GaussianMixture(n_components=4, random_state=100)
    y_pred = cluster.fit_predict(X)
    data = pd.DataFrame(X)
    data['pre_class'] = y_pred
    de = TSNE(n_components=2, random_state=0).fit_transform(X)
    # de = PCA(n_components=2).fit_transform(X)
    # d = de[data['pre_class']==0]
    # plt.plot(d[:, 0], d[:, 1], 'r.')
    # d = de[data['pre_class']==1]
    # plt.plot(d[:, 0], d[:, 1], 'go')
    # d = de[data['pre_class']==2]
    # plt.plot(d[:, 0], d[:, 1], 'b*')
    # d = de[data['pre_class']==3]
    # plt.plot(d[:, 0], d[:, 1], 'y+')
    plt.scatter(de[:, 0], de[:, 1], c=y_pred)
    plt.show()
Example #19
def best_kmeans(x, k_nums, centers):
    all_measures = []
    all_kmeans = []

    # k_nums and centers are assumed to be parallel lists of cluster
    # counts and initial centers
    for i in range(len(k_nums)):
        kmeans = KMeans(n_clusters=k_nums[i], init=centers[i]).fit(x)
        all_kmeans.append(kmeans)

        labels = kmeans.labels_
        measure = []
        measure.append(silhouette_score(x, labels))
        measure.append(calinski_harabasz_score(x, labels))
        measure.append(davies_bouldin_score(x, labels))
        measure.append(len(set(labels)))
        measure.append(0)
        all_measures.append(measure)

    # give_graph_arrays is assumed to be defined elsewhere
    sil, dav, cal, num_clusters, man_f = give_graph_arrays(all_measures)

    sil = scale(sil)
    dav = scale(dav)
    cal = scale(cal)

    # higher silhouette and Calinski-Harabasz are better; lower Davies-Bouldin is better
    best_index = 0
    best_score = sil[0] + cal[0] - dav[0]

    for idx in range(len(sil)):
        current_calc = sil[idx] + cal[idx] - dav[idx]

        if num_clusters[idx] == 1:
            continue

        if current_calc > best_score:
            best_index = idx
            best_score = current_calc

    return (all_kmeans[best_index], all_measures[best_index])
Example #20
  def clusteringAgglomerativeClustering(self, X, n_c, verbose):
    best_n_cluster = 2  # fall back to two clusters if no candidate improves the score
    best_sil = 0
    if n_c == 0:
      for i in range(2, 11):  # start at 2: a single cluster is rejected by the metric
        model = AgglomerativeClustering(n_clusters=i).fit(X)
        labels = model.labels_

        if len(set(labels)) <= 1: continue
        sil = calinski_harabasz_score(X, labels)
        # sil = silhouette_score(X, labels, metric='sqeuclidean')
        if sil > best_sil:
          best_sil = sil
          best_n_cluster = i

      model = AgglomerativeClustering(n_clusters=best_n_cluster).fit(X)
      labels = model.labels_

      n_clusters_ = model.n_clusters_
      if verbose:
        print('Estimated number of clusters: %d' % n_clusters_)
    else:
      model = AgglomerativeClustering(n_clusters=n_c).fit(X)
      labels = model.labels_

      n_clusters_ = model.n_clusters_
    if verbose:
      colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
      for k, col in zip(range(n_clusters_), colors):
          class_members = labels == k
          plt.scatter(X[class_members, 0], X[class_members, 1], s=200, c=col)
          # plt.plot(X[class_members, 0], X[class_members, 1], col + '.')

      plt.title('Estimated number of clusters: %d' % n_clusters_)
      plt.show()

    return model.labels_, n_clusters_
Example #21
def visualise_calinski_harabasz(subfolder, dataset, results):
    indexes = list(range(len(results)))
    names = []
    scores = []
    colors = []
    image_path = f"../../images/{subfolder}/{dataset}"

    if not os.path.exists(image_path):
        os.makedirs(image_path)

    for r in results:
        names.append(r["name"])

        if r['name'].startswith("KUB0462"):
            colors.append('r')
        else:
            colors.append('b')

        items, labels = labelize_clusters(r["clusters"])

        score = metrics.calinski_harabasz_score(items, labels)
        scores.append(score)

        name = r['name']
        print(f'calinski: {name} - {score}')

    fig, ax = plt.subplots()
    width, height = get_size(results)
    fig.set_size_inches(width, height)

    ax.set_xticks(indexes)
    ax.set_xticklabels(names)
    ax.bar(indexes, scores, color=colors)

    plt.margins(0.01)
    plt.title(f"{dataset.upper()}: Calinski Harabasz Score")
    plt.savefig(image_path + "/calinski_harabasz_score.png")
def calculate_extrinsic_metrics(dataset, real_classes, predicted_classes):
    confusion_matrix = matriz_confusion(real_classes, predicted_classes)

    # external (label-based) metrics, averaged at the end under 'mean'
    purity = medida_pureza(confusion_matrix)
    f1 = medida_f1(confusion_matrix)
    mutual_info = metrics.mutual_info_score(real_classes, predicted_classes)
    ari = metrics.adjusted_rand_score(real_classes, predicted_classes)
    homogeneity = metrics.homogeneity_score(real_classes, predicted_classes)
    completeness = metrics.completeness_score(real_classes, predicted_classes)
    v_measure = metrics.v_measure_score(real_classes, predicted_classes)
    fowlkes = metrics.fowlkes_mallows_score(real_classes, predicted_classes)

    return {
        'Error': medida_error(confusion_matrix),
        'Purity': purity,
        'F1': f1,
        'Entropy': medida_entropia(confusion_matrix),
        'Mutual information': mutual_info,
        'ARI': ari,
        'Homogeneity': homogeneity,
        'Completeness': completeness,
        'V-measure': v_measure,
        'Fowlkes-Mallows': fowlkes,
        'Silhouette': metrics.silhouette_score(dataset, predicted_classes, metric='euclidean'),
        'Calinski-Harabasz': metrics.calinski_harabasz_score(dataset, predicted_classes),
        'Davies-Bouldin': davies_bouldin_score(dataset, predicted_classes),
        'mean': (purity + f1 + mutual_info + ari + homogeneity +
                 completeness + v_measure + fowlkes) / 8
    }
Example #23
def get_marks(estimator, data, name=None):
    """Compute the scores; five require the dataset's true labels and three do not, see readme.txt

    :param estimator: the model
    :param name: the initialization method
    :param data: the feature dataset
    """
    estimator.fit(data)
    print(20 * '*', name, 20 * '*')

    # `labels` is assumed to be a module-level array of ground-truth labels
    print("Homogeneity Score: ",
          metrics.homogeneity_score(labels, estimator.labels_))
    print("Completeness Score: ",
          metrics.completeness_score(labels, estimator.labels_))
    print("V Measure Score: ",
          metrics.v_measure_score(labels, estimator.labels_))
    print("Adjusted Rand Score: ",
          metrics.adjusted_rand_score(labels, estimator.labels_))
    print("Adjusted Mutual Info Score: ",
          metrics.adjusted_mutual_info_score(labels, estimator.labels_))
    print("Calinski Harabasz Score: ",
          metrics.calinski_harabasz_score(data, estimator.labels_))
    print("Silhouette Score: ",
          metrics.silhouette_score(data, estimator.labels_))
def internalValidation(data, clusters):
    scores = {}
    """
    The score is bounded between -1 for incorrect clustering and +1 for highly dense clustering. 
    Scores around zero indicate overlapping clusters.
    The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster.
    """
    scores['silhouette_score'] = metrics.silhouette_score(data,
                                                          clusters,
                                                          metric='euclidean')
    """
    The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster.
    The score is fast to compute
    """
    scores['calinski_harabasz_score'] = metrics.calinski_harabasz_score(
        data, clusters)
    """
    Zero is the lowest possible score. Values closer to zero indicate a better partition.
    The Davies-Bouldin index is generally higher for convex clusters than for other concepts
    of clusters, such as the density-based clusters obtained from DBSCAN.
    """
    scores['davies_bouldin_score'] = metrics.davies_bouldin_score(
        data, clusters)
    return scores
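A minimal sketch calling internalValidation; the synthetic data and k=3 are assumptions for illustration.

import numpy as np
from sklearn.cluster import KMeans

data = np.random.RandomState(0).rand(150, 2)
clusters = KMeans(n_clusters=3, random_state=0).fit_predict(data)
for name, value in internalValidation(data, clusters).items():
    print(name, round(value, 3))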
Example #25
 def train_model():
     kmms = []
     X = G['dist'][G['dist'] > 0].reshape(-1, 1)
     for i in range(1, max_clusters + 1):
         km = KMeans(n_clusters=i)
         yhat = km.fit_predict(X)
         try:
             score = calinski_harabasz_score(X, yhat)
         except ValueError:  # k=1 yields a single label, which the metric rejects
             score = -np.inf
         label_means = [(label, X[yhat == label].mean())
                        for label in set(yhat)]
         min_mean_label, min_mean = min(label_means, key=lambda x: x[1])
         max_mean_label, max_mean = max(label_means, key=lambda x: x[1])
         kmms.append((i, score, min_mean_label, min_mean, max_mean_label,
                      max_mean, km))
     # Calinski-Harabasz is higher-is-better, so select the k with the maximum score
     k, scr, G['min_dist_label'], _, _, max_mean, G['kmm'] = max(
         kmms, key=lambda x: x[1])
     k1mean = kmms[0][3]
     dist_factor = abs(k1mean - max_mean) / max_mean
     if dist_factor < CONFIG["ft_dist_factor"]:
         k, scr, G['min_dist_label'], _, _, _, G['kmm'] = kmms[0]
     logging.debug('k=%s, score=%s, min_dist_label=%s, dist_factor=%s', k,
                   scr, G['min_dist_label'], dist_factor)
def clustererEvaluationMetric(clusterer_container):
    import numpy as np
    from sklearn import metrics
    clusterer, X, SX, y, size, clusters_num, metricstring = unPackClustererContainer(clusterer_container)
    # percentage of elements in each cluster (label -1 is noise);
    # clusterer_container['performance'] ends up shaped like:
    # {'time': t1-t0, 'clusters_num': max(clusterer.labels_)+1,
    #  'cluster_elements_percent': [0.4, 0.2, 0.1], 'silhouette': 0.23, 'calinski': 342.9}
    cluster_elements = [(clusterer.labels_ == i).sum() for i in range(-1, clusters_num)]
    cluster_elements_percent = np.array(cluster_elements) / len(X)
    clusterer_container['performance']['cluster_elements_percent'] = cluster_elements_percent
    # numeric quality metrics, selected by the characters in `metricstring`
    if 'r' in metricstring:
        m = metrics.adjusted_rand_score(y, clusterer.labels_)
        clusterer_container['performance']['a-randscore'] = m
    if 'm' in metricstring:
        m = metrics.adjusted_mutual_info_score(y, clusterer.labels_)
        clusterer_container['performance']['a-mutualinfo'] = m
    if 's' in metricstring:
        sample_size = len(X)
        # the index is undefined with fewer than two clusters
        try:
            m = -9999 if clusters_num <= 1 else metrics.silhouette_score(
                X, clusterer.labels_, metric='euclidean', sample_size=sample_size)
        except Exception:
            m = -9999
        clusterer_container['performance']['silhouette'] = m
    if 'c' in metricstring:
        # the index is undefined with fewer than two clusters
        try:
            m = -9999 if clusters_num <= 1 else metrics.calinski_harabasz_score(X, clusterer.labels_)
        except Exception:
            m = -9999
        clusterer_container['performance']['calinski'] = m
    return clusterer_container
Example #27
 def kmeans(self):
     try:
         if self.dane.empty:
             self.showDialog('Data has not been loaded')
             return
         daneKMeans = self.dane.copy()
         daneKMeans = daneKMeans.astype(float64)
         # exact match on the metric name
         if self.ui.comboBoxKMeansMetryka.currentText() == 'cosine':
             daneKMeans = preprocessing.normalize(daneKMeans, norm='l2')
         KM = kmeans(daneKMeans,
                     n_clusters=self.ui.spinBoxKMeansIloscKlastrow.value(),
                     metric=self.ui.comboBoxKMeansMetryka.currentText(),
                     maxiter=1000,
                     verbose=0)
         labelsKM = KM[1]
         if self.ui.radioButtonMiaraSilhouette.isChecked():
             score_silhouette_KM = silhouette_score(
                 self.dane,
                 labelsKM,
                 metric=self.ui.comboBoxKMeansMetryka.currentText())
             score_silhouette_KM = round(score_silhouette_KM, 4)
             self.ui.txtWynikSprawdzenia.setText(str(score_silhouette_KM))
         elif self.ui.radioButtonMiaraDaviesBoudlin.isChecked():
             score_DaviesBoudlin_KM = davies_bouldin_score(
                 self.dane, labelsKM)
             score_DaviesBoudlin_KM = round(score_DaviesBoudlin_KM, 4)
             self.ui.txtWynikSprawdzenia.setText(
                 str(score_DaviesBoudlin_KM))
         elif self.ui.radioButtonMiaraCelinskiHarabasz.isChecked():
             score_CalinskiHarabasz_KM = calinski_harabasz_score(
                 self.dane, labelsKM)
             score_CalinskiHarabasz_KM = round(score_CalinskiHarabasz_KM, 4)
             self.ui.txtWynikSprawdzenia.setText(
                 str(score_CalinskiHarabasz_KM))
     except Exception as inst:
         self.showDialog(inst)
Example #28
def get_marks(estimator, data, name=None):
    """Compute the scores; five require the dataset's true labels and three do not, see readme.txt

    :param estimator: the model
    :param name: the initialization method
    :param data: the feature dataset
    """
    estimator.fit(data.astype(np.float64))
    print(30 * '*', name, 30 * '*')
    print("       Model and parameters: ", estimator)
    # `labels` is assumed to be a module-level array of ground-truth labels
    print("Homogeneity Score: ",
          metrics.homogeneity_score(labels, estimator.labels_))
    print("Completeness Score: ",
          metrics.completeness_score(labels, estimator.labels_))
    print("V-Measure Score: ",
          metrics.v_measure_score(labels, estimator.labels_))
    print("Adjusted Rand Score: ",
          metrics.adjusted_rand_score(labels, estimator.labels_))
    print("Adjusted Mutual Info Score: ",
          metrics.adjusted_mutual_info_score(labels, estimator.labels_))
    print("Calinski Harabasz Score: ",
          metrics.calinski_harabasz_score(data, estimator.labels_))
    print("Silhouette Score: ",
          metrics.silhouette_score(data, estimator.labels_))
def clustering(dm, eps, path):
    db = DBSCAN(eps=eps, min_samples=1, metric="precomputed").fit(dm)
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)
    data_df = pandas.read_csv(path +
                              "/temporary/list_to_csv_with_corner_points.csv",
                              sep=";")
    data_df["cluster"] = labels
    try:
        dbs = davies_bouldin_score(dm, labels)
        chs = metrics.calinski_harabasz_score(dm, labels)
        silhouette = metrics.silhouette_score(dm, labels, metric='precomputed')
        print("DB score: ", dbs)
        print("calinski: ", chs)
        print("silhouette: ", silhouette)

    except Exception:
        dbs = 1
        chs = 1
        silhouette = 1

    data_df["ausrichtung"] = 1
    data_df = data_df.groupby(['cluster', 'ausrichtung'
                               ])['element'].apply(','.join).reset_index()
    data_df.to_csv(path +
                   "/temporary/values_clusteredfrom_precomputed_dbscan.csv",
                   sep=";",
                   header=False,
                   index=False)

    return data_df, n_clusters_, dbs, chs, silhouette
def main(args):
    data_dir = os.path.join(args.root, "data")  # get data dir

    data_path = os.path.join(data_dir, args.inFile)

    metrics_path = os.path.join(args.root, "metrics", args.outFileName)

    print('load data from ' + data_path)
    df = pd.read_csv(data_path)
    pred = df['pred'].tolist()
    target = df['target'].tolist()

    eval_result = '*' * 8 + 'Evaluation' + '*' * 10
    print('Evaluation:')
    print(f'Data Size:{df.size}')
    # both metrics expect a 2-D feature matrix; here the 1-D `target` column
    # stands in for the features, so wrap it into single-element rows
    features = [[t] for t in target]
    cal = calinski_harabasz_score(features, pred)
    eval_result += '\n' + f'calinski_harabasz:{cal}'
    report = silhouette_score(features, pred, metric='euclidean')
    eval_result += '\n' + f'silhouette_score:{report}'
    print(eval_result)
    if args.txt_report == 'true':
        with open(metrics_path + '.txt', 'w') as txt_file:
            txt_file.write(eval_result)
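The `args` namespace read by main is not defined in the excerpt; below is a plausible argparse wiring (the default values are assumptions) exposing exactly the attributes the function reads.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--root', default='.')
    parser.add_argument('--inFile', default='predictions.csv')
    parser.add_argument('--outFileName', default='clustering_metrics')
    parser.add_argument('--txt_report', default='true')
    main(parser.parse_args())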