Beispiel #1
0
def cluster(train, val, type, number_of_clusters, plot_folder, classes):
    """Fit a clustering model on train["data"], assign validation points to
    the learned centroids, visualize both, and return a dict of scores.

    Args:
        train, val: dicts with at least "data" and "labels" entries.
        type: "spectral_clustering" or "kmeans"; anything else raises
            NotImplementedError.  (NOTE: shadows the builtin ``type``.)
        number_of_clusters: number of clusters for the model.
        plot_folder: output destination for visualize_clustering.
        classes: unused in this function body — presumably kept for
            interface parity with callers; verify before removing.

    Returns:
        dict mapping metric names to scores for the training split and
        (with a ``_test`` suffix) the validation split.
    """
    # todo this should be a class
    if type == "spectral_clustering":
        clustering_model = SpectralClustering(n_clusters=number_of_clusters,
                                              assign_labels="discretize",
                                              random_state=0).fit(
                                                  train["data"])
    elif type == "kmeans":
        clustering_model = KMeans(n_clusters=number_of_clusters,
                                  random_state=0).fit(train["data"])
    else:
        raise NotImplementedError
    # compute metrics
    accuracies = {}
    # Random-label baseline; hard-coded 9 is presumably the class count —
    # TODO confirm against the dataset.
    random_array = np.random.randint(9, size=train["labels"].shape)
    centroids = find_centroids(number_of_clusters, train,
                               clustering_model.labels_)
    # Assign validation points to the nearest training centroid.
    test_classifications = cluster_test(val, centroids)
    visualize_clustering(train, clustering_model.labels_, type + "_training",
                         plot_folder, number_of_clusters, centroids)
    visualize_clustering(val, np.asarray(test_classifications),
                         type + "_validation", plot_folder, number_of_clusters,
                         centroids)

    accuracies["random_score"] = homogeneity_score(train["labels"],
                                                   random_array)
    accuracies["v_measure_score"] = v_measure_score(train["labels"],
                                                    clustering_model.labels_)
    accuracies["homogeneity_score"] = homogeneity_score(
        train["labels"], clustering_model.labels_)
    accuracies["completeness_score"] = completeness_score(
        train["labels"], clustering_model.labels_)
    accuracies["silhouette_score"] = silhouette_score(train["data"],
                                                      clustering_model.labels_)
    accuracies["purity_score"], accuracies[
        "contingency_matrix"] = purity_score(train["labels"],
                                             clustering_model.labels_)

    # Same metrics on the validation split.
    accuracies["v_measure_score_test"] = v_measure_score(
        val["labels"], test_classifications)
    accuracies["homogeneity_score_test"] = homogeneity_score(
        val["labels"], test_classifications)
    accuracies["completeness_score_test"] = completeness_score(
        val["labels"], test_classifications)
    accuracies["silhouette_score_test"] = silhouette_score(
        val["data"], test_classifications)
    accuracies["purity_score_test"], accuracies[
        "contingency_matrix_test"] = purity_score(val["labels"],
                                                  test_classifications)
    return accuracies
Beispiel #2
0
def get_clustering_metrics(train_data,
                           cluster_labels,
                           ground_truth_labels=None):
    """Build a dict of clustering quality metrics.

    Internal metrics (silhouette, Calinski-Harabasz, Davies-Bouldin) are
    always computed; the external, label-based metrics are added only when
    ground-truth labels are supplied.
    """
    clustering_metric_dict = {
        'silhouette_score': silhouette_score(
            train_data, cluster_labels, random_state=42),
        'calinski_harabasz_score': calinski_harabasz_score(
            train_data, cluster_labels),
        'davies_bouldin_score': davies_bouldin_score(
            train_data, cluster_labels),
    }

    if ground_truth_labels is not None:
        external_scorers = (
            ('v_measure_score', v_measure_score),
            ('fowlkes_mallows_score', fowlkes_mallows_score),
            ('homogeneity_score', homogeneity_score),
            ('normalized_mutual_info_score', normalized_mutual_info_score),
            ('adjusted_rand_score', adjusted_rand_score),
            ('completeness_score', completeness_score),
        )
        for name, scorer in external_scorers:
            clustering_metric_dict[name] = scorer(ground_truth_labels,
                                                  cluster_labels)

    return clustering_metric_dict
Beispiel #3
0
    def show_result(self, prediction, msg):
        """Print a banner, the confusion matrix, and five clustering scores
        comparing ``prediction`` against ``self.train_labels``; return the
        scores as a dict.

        Fix: the two Python-2 ``print`` statements are normalized to the
        parenthesized form used by the rest of this method (identical output
        for a single string argument under both Python 2 and 3).
        """
        new_line(50)
        print(msg)
        new_line(50)

        real = self.train_labels

        print("Confusion Matrix: ")
        print(str(confusion_matrix(real, prediction)))

        homo_score = homogeneity_score(real, prediction)
        complete_score = completeness_score(real, prediction)
        v_score = v_measure_score(real, prediction)
        rand_score = adjusted_rand_score(real, prediction)
        mutual_info = adjusted_mutual_info_score(real, prediction)

        print("Homogeneity Score: %0.3f" % homo_score)
        print("Completeness Score: %0.3f" % complete_score)
        print("V-measure: %0.3f" % v_score)
        print("Adjusted Rand Score: %0.3f" % rand_score)
        print("Adjusted Mutual Info Score: %0.3f\n" % mutual_info)

        return {
            'Homogeneity': homo_score,
            'Completeness': complete_score,
            'V-measure': v_score,
            'RAND': rand_score,
            'Mutual': mutual_info
        }
Beispiel #4
0
def compute_V_measure(clusters, classes):
    """Score a read clustering against alignment-derived classes.

    Reads that were clustered but never aligned are excluded from the
    scores (only counted); reads that were aligned but never clustered are
    each added as a fresh singleton cluster so they still penalize the
    measures.

    Returns:
        (v_measure, completeness, homogeneity, n_unaligned, ARI)
    """
    class_list = []
    cluster_list = []
    clustered_but_unaligned = 0
    for read, cluster_id in clusters.items():
        if read in classes:
            class_list.append(classes[read])
            cluster_list.append(cluster_id)
        else:
            clustered_but_unaligned += 1

    # Aligned-but-unclustered reads: each one gets its own new cluster id.
    not_clustered = set(classes.keys()) - set(clusters.keys())
    next_cluster_id = max(clusters.values()) + 1
    for read in not_clustered:
        class_list.append(classes[read])
        cluster_list.append(next_cluster_id)
        next_cluster_id += 1

    v_score = v_measure_score(class_list, cluster_list)
    compl_score = completeness_score(class_list, cluster_list)
    homog_score = homogeneity_score(class_list, cluster_list)
    ari = adjusted_rand_score(class_list, cluster_list)

    print("Not included in clustering but aligned:", len(not_clustered))
    print("V:", v_score, "Completeness:", compl_score, "Homogeneity:",
          homog_score)
    print(
        "Nr reads clustered but unaligned (i.e., no class and excluded from V-measure): ",
        clustered_but_unaligned)
    return v_score, compl_score, homog_score, clustered_but_unaligned, ari
Beispiel #5
0
def calc_homogenity_comp_vmeas_training(df, y_train):
    """Print homogeneity, V-measure and completeness of the predicted
    training labels ``y_train`` against the gold labels in ``df['class']``.

    Cleanup: removed a large block of commented-out legacy code that
    duplicated this logic on a per-candidate basis.
    """
    truelabels = df['class']
    predictlabels = y_train
    homogenity_scr = homogeneity_score(truelabels, predictlabels)
    vmeasure_scr = v_measure_score(truelabels, predictlabels)
    completness_scr = completeness_score(truelabels, predictlabels)
    print("truelables:", truelabels)
    print("predictlabels:", predictlabels)
    print("homogenity_scr={},vmeasure_scr={},completness_scr={}".format(
        homogenity_scr, vmeasure_scr, completness_scr))
    def evaluate_bins(self):
        """Flatten self.bins into flat label/color arrays and, when
        ground-truth colors are available, print clustering quality scores.

        Bug fix: the original re-checked ``self.colors != None`` AFTER
        rebinding ``self.colors`` to a (possibly empty) list, so the
        no-colors case wrongly entered the scoring branch with mismatched
        array lengths.  We now record whether colors were provided before
        the rebind, and use ``is not None`` per Python idiom.
        """
        self.labels = []
        newcolors = []
        # Decide BEFORE self.colors is rebound below.
        has_colors = self.colors is not None
        for bin in self.bins:
            for b in bin:
                # Label every trip with the index of the bin it belongs to.
                self.labels.append(self.bins.index(bin))
                if has_colors:
                    newcolors.append(self.colors[b])
        self.colors = newcolors

        labels = numpy.array(self.labels)
        colors = numpy.array(self.colors)
        points = []
        for bin in self.bins:
            for b in bin:
                # Feature vector: [start_lat, start_lon, end_lat, end_lon].
                start_lat = self.data[b]['trip_start_location'][1]
                start_lon = self.data[b]['trip_start_location'][0]
                end_lat = self.data[b]['trip_end_location'][1]
                end_lon = self.data[b]['trip_end_location'][0]
                points.append([start_lat, start_lon, end_lat, end_lon])

        if has_colors:
            a = metrics.silhouette_score(numpy.array(points), labels)
            b = homogeneity_score(colors, labels)
            c = completeness_score(colors, labels)

            print('number of bins is ' + str(len(self.bins)))
            print('silhouette score is ' + str(a))
            print('homogeneity is ' + str(b))
            print('completeness is ' + str(c))
            print('accuracy is ' + str(((a+1)/2.0 + b + c)/3.0))
def compareAB(A, B, X):
    """Return an '&'-joined row of six clustering comparison scores for the
    two label assignments A (reference) and B (candidate) over data X, each
    formatted as '%17.3f'.
    """
    scores = [
        # Similarity ignoring permutations, with chance normalization.
        metrics.adjusted_rand_score(A, B),
        # Each cluster contains only members of a single class.
        homogeneity_score(A, B),
        # All members of a given class are assigned to the same cluster.
        completeness_score(A, B),
        metrics.v_measure_score(A, B),
        # Geometric mean of the pairwise precision and recall.
        metrics.fowlkes_mallows_score(A, B),
        metrics.silhouette_score(X, B, metric='euclidean'),
    ]
    return "&".join('%17.3f' % value for value in scores)
Beispiel #8
0
def main():
    """Draw synthetic clustered points, run normalized spectral clustering,
    and plot both the true and the recovered assignments with their
    completeness/homogeneity scores."""
    m = 2000  # number of points
    n = 50  # Number of dimensions
    k = 30  # Number of latent clusters

    np.random.seed(3)
    X, z_true = draw_points(m, n, k=k)
    show_points(X, z_true, title="True")

    # Dense (fully connected) similarity matrix over all points.
    S = fully_connected_similarity(X)

    # Unnormalized spectral clustering
    # A = spectral_clustering(S, k)

    # Normalized spectral clustering according to Shi and Malik (2000)
    # A = spectral_clustering(S, k, normalization=LaplacianNorm.symmetric, generalized_eigenproblem=True)

    # Normalized spectral clustering according to Ng, Jordan, and Weiss (2002)
    A = spectral_clustering(S,
                            k,
                            normalization=LaplacianNorm.symmetric,
                            norm_rows=True)

    show_points(X, A, title="Spectral Clustering")

    # External scores against the generating labels.
    complete_score = completeness_score(z_true, A)
    homog_score = homogeneity_score(z_true, A)
    print("Completeness score: %s" % complete_score)
    print("Homogeneity score: %s" % homog_score)
Beispiel #9
0
def print_scores(labels, predicted, svd):
    """Print five clustering scores, append them to the module-level
    ``svd_all`` / ``nmf_all`` accumulator, and return the homogeneity score.

    Fixes: each metric is now computed once instead of twice (the original
    recomputed every score for the return list), and the Python-2 print
    statements are normalized to the parenthesized form.
    """
    homogeneity = homogeneity_score(labels, predicted)
    completeness = completeness_score(labels, predicted)
    v_measure = v_measure_score(labels, predicted)
    rand = adjusted_rand_score(labels, predicted)
    mutual_info = adjusted_mutual_info_score(labels, predicted)

    print("Homogeneity: " + str(homogeneity))
    print("completeness: " + str(completeness))
    print("V-measure: " + str(v_measure))
    print("RAND score: " + str(rand))
    print("Mutual Info: " + str(mutual_info))

    ret = [homogeneity, completeness, v_measure, rand, mutual_info]
    # Accumulate into the module-level result lists.
    if svd:
        svd_all.append(ret)
    else:
        nmf_all.append(ret)
    return homogeneity
 def calculate_scores(self):
   """Compute clustering evaluation scores from self.x (data), self.c
   (ground-truth classes) and self.labels (cluster labels), storing each
   score on the instance."""
   x, c, labels = self.x, self.c, self.labels
   self.v_measure = v_measure_score(c, labels)
   self.complete = completeness_score(c, labels)
   self.adjusted_mutual = adjusted_mutual_info_score(c, labels)
   self.adjusted_rand = adjusted_rand_score(c, labels)
   # NOTE(review): silhouette is computed with the ground-truth labels c,
   # not the cluster labels — looks suspicious; confirm intent.
   self.silhouette = silhouette_score(x, c)
   self.purity, self.partial_purity = self.__purity__()
 def calculate_scores(self):
     """Compute clustering evaluation scores from self.x (data), self.c
     (ground-truth classes) and self.labels (cluster labels), storing each
     score on the instance."""
     x, c, labels = self.x, self.c, self.labels
     self.v_measure = v_measure_score(c, labels)
     self.complete = completeness_score(c, labels)
     self.adjusted_mutual = adjusted_mutual_info_score(c, labels)
     self.adjusted_rand = adjusted_rand_score(c, labels)
     # NOTE(review): silhouette is computed with the ground-truth labels c,
     # not the cluster labels — looks suspicious; confirm intent.
     self.silhouette = silhouette_score(x, c)
     self.purity, self.partial_purity = self.__purity__()
Beispiel #12
0
def five_measure_scores(label_true, label_pred):
    """Print five external clustering-agreement measures for the given
    ground-truth and predicted label assignments."""
    scorers = (
        ("Homogeneity_score", homogeneity_score),
        ("Completeness_score", completeness_score),
        ("Adjusted_rand_score", adjusted_rand_score),
        ("V_measure_score", v_measure_score),
        ("Adjusted_mutual_info_score", adjusted_mutual_info_score),
    )
    for name, scorer in scorers:
        print("%s = %f" % (name, scorer(label_true, label_pred)))
Beispiel #13
0
def v_measure(cluster_labels, true_labels):
    """Print and return (homogeneity, completeness, v_measure) of the
    cluster labels against the true labels."""
    h_score, c_score, v_score = (
        fn(true_labels, cluster_labels)
        for fn in (homogeneity_score, completeness_score, v_measure_score))

    for name, value in zip(("Homogeneity", "Completeness", "V Measure"),
                           (h_score, c_score, v_score)):
        print("%s Score: %.6f" % (name, value))
    return h_score, c_score, v_score
Beispiel #14
0
def print_scores(labels, predicted):
    """Print the contingency (confusion) matrix and five clustering scores;
    return the scores as a list.

    Fixes: each metric is now computed once instead of twice, and the
    Python-2 print statements are normalized to the parenthesized form
    (identical output for a single string argument under both versions).
    """
    print("Contingency: ")
    print(str(confusion_matrix(labels, predicted)))

    homogeneity = homogeneity_score(labels, predicted)
    completeness = completeness_score(labels, predicted)
    v_measure = v_measure_score(labels, predicted)
    rand = adjusted_rand_score(labels, predicted)
    mutual_info = adjusted_mutual_info_score(labels, predicted)

    print("Homogeneity: " + str(homogeneity))
    print("completeness: " + str(completeness))
    print("V-measure: " + str(v_measure))
    print("RAND score: " + str(rand))
    print("Mutual Info: " + str(mutual_info))

    return [homogeneity, completeness, v_measure, rand, mutual_info]
def get_landmarking(dataset_name, df):
    """Compute landmarking meta-features (LM1..LM21) for a dataset.

    Samples ~10% of df (min 40 rows), derives pseudo-labels with DBSCAN,
    then measures the accuracy of several simple learners (stumps, 1-NN,
    naive Bayes) on those labels plus agreement between DBSCAN and k-means
    labelings.

    Returns:
        (record, (n_rows, n_cols, elapsed_seconds)) where record maps
        'LM*' feature names to values.
    """
    start = time.time()
    record = {'dataset': dataset_name.split('.')[0]}
    results = []
    # 10% sample for large frames; at most 40 rows otherwise.
    n_samples = int(len(df)*0.1) if len(df) > 400 else min(df.shape[0], 40)
    data = df.sample(n=n_samples, replace=False)
    labels = get_dbscan(data)
    k = len(np.unique(labels))
    labels2 = get_Kmeans(data, k, 40)
    full_tree = DecisionTreeClassifier()
    full_tree.fit(data, labels)
    # Least informative attribute according to the full tree.
    worst_attr = np.argmin(full_tree.feature_importances_)

    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)
    best_stump = DecisionTreeClassifier(max_depth=1)
    random_stump = DecisionTreeClassifier(splitter="random", max_depth=1)
    worst_stump = DecisionTreeClassifier(max_depth=1)
    # NOTE(review): elite_knn and one_knn are configured identically
    # (n_neighbors=1) — presumably elite_knn was meant to differ; confirm.
    elite_knn = KNeighborsClassifier(n_neighbors=1)
    one_knn = KNeighborsClassifier(n_neighbors=1,
            algorithm="auto",
            weights="uniform",
            p=2,
            metric="minkowski")
    nb = GaussianNB()
    lda = LinearDiscriminantAnalysis()
    best_stump.fit(X_train, y_train)
    random_stump.fit(X_train, y_train)
    worst_stump.fit(X_train.iloc[:, worst_attr].values.reshape(-1, 1), y_train)
    elite_knn.fit(X_train, y_train)
    one_knn.fit(X_train, y_train)
    # lda.fit(X_train, y_train)
    nb.fit(X_train, y_train)

    record['LM1'] = np.log2(df.shape[0])
    record['LM2'] = np.log2(df.shape[1])
    record['LM3'] = accuracy_score(best_stump.predict(X_test), y_test)
    # record['LM4'] = f1_score(best_stump.predict(X_test), y_test, average='weighted')
    record['LM5'] = accuracy_score(random_stump.predict(X_test), y_test)
    # record['LM6'] = f1_score(random_stump.predict(X_test), y_test, average='weighted')
    # record['LM7'] = model.inertia_
    record['LM8'] = accuracy_score(elite_knn.predict(X_test), y_test)
    # record['LM9'] = f1_score(elite_knn.predict(X_test), y_test, average='weighted')
    # record['LM10'] = accuracy_score(lda.predict(X_test), y_test)
    # record['LM11'] = f1_score(lda.predict(X_test), y_test, average='weighted')
    record['LM12'] = accuracy_score(nb.predict(X_test), y_test)
    # record['LM13'] = f1_score(nb.predict(X_test), y_test, average='weighted')
    record['LM14'] = accuracy_score(one_knn.predict(X_test), y_test)
    # record['LM15'] = f1_score(one_knn.predict(X_test), y_test, average='weighted')
    record['LM16'] = accuracy_score(worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test)
    # record['LM17'] = f1_score(worst_stump.predict(X_test.iloc[:, worst_attr].values.reshape(-1, 1)), y_test, average='weighted')
    # Agreement between the DBSCAN and k-means labelings.
    record['LM18'] = adjusted_rand_score(labels, labels2)
    record['LM19'] = adjusted_mutual_info_score(labels, labels2)
    record['LM20'] = completeness_score(labels, labels2)
    record['LM21'] = fowlkes_mallows_score(labels, labels2)

    end = time.time()
    return record, (df.shape[0], df.shape[1], end-start)
Beispiel #16
0
 def calc_performance_score(self, algo_type: str, predicted, y_train):
     """Print homogeneity, completeness and adjusted mutual information
     for the predicted cluster labels against the ground truth ``y_train``,
     prefixed with the algorithm name.

     Fixes misspelled metric names in the printed output
     ("complete_socre", "adjusted_mute_info_score").
     """
     homo_score = homogeneity_score(y_train, predicted)
     complete_score = completeness_score(y_train, predicted)
     adjusted_mutual_info = adjusted_mutual_info_score(
         y_train, predicted)
     print(algo_type + ' homo_score ' + "{:.2f}".format(homo_score))
     print(algo_type + ' complete_score ' + "{:.2f}".format(complete_score))
     print(algo_type + ' adjusted_mutual_info_score ' +
           "{:.2f}".format(adjusted_mutual_info))
Beispiel #17
0
 def get_homogeneity_and_completeness(self, clusters, category):
     """Score the clustering against node attributes of ``category``.

     Looks up the nodes' ``to_<category>`` mapping, re-encodes its values
     as dense integer ids, and returns homogeneity/completeness of
     ``clusters`` against them.
     """
     labels = getattr(self.scb.nodes,
                      'to_' + category)(range(len(self.scb.nodes)))
     keys = dict()
     # Re-encode labels as dense integers in first-seen order.
     for i, label in enumerate(labels):
         if label not in keys:
             keys[label] = len(keys)
         labels[i] = keys[label]
     hs = homogeneity_score(labels, clusters)
     cs = completeness_score(labels, clusters)
     return {'homogeneity': hs, 'completeness': cs}
Beispiel #18
0
def _plot_km_metric(clusters, series, title):
    """Plot one curve per dataset variant (PCA/ICA/RP/RF/Original) of a
    per-k metric dict, then show the figure."""
    for i in range(5):
        plt.plot(clusters, series[i].values(), 'bx-', color='C%d' % i)
    plt.legend(['PCA', 'ICA', 'RP', 'RF', 'Original'])
    plt.xlabel('k')
    plt.title(title)
    plt.show()


def km(clusters, dats, Y):
    """Run k-means for every k in ``clusters`` on every dataset in ``dats``
    and plot NMI, inertia (elbow), completeness and silhouette curves.

    Fixes: the local k-means model no longer shadows the function name
    ``km``, and the four copy-pasted plotting blocks are factored into
    ``_plot_km_metric``.
    """
    NMI = defaultdict(dict)
    INL = defaultdict(dict)
    CMS = defaultdict(dict)
    SIL = defaultdict(dict)
    for i, dat in enumerate(dats):
        for cluster in clusters:
            km_model = KMeans(n_clusters=cluster, random_state=0).fit(dat)
            cluster_labels = km_model.labels_
            NMI[i][cluster] = normalized_mutual_info_score(Y, cluster_labels)
            INL[i][cluster] = km_model.inertia_
            CMS[i][cluster] = completeness_score(Y, cluster_labels)
            SIL[i][cluster] = silhouette_score(dat, cluster_labels)

    _plot_km_metric(clusters, NMI, 'Normalized Mutual Information on K-Means')
    _plot_km_metric(clusters, INL, 'Elbow Method on K-Means')
    _plot_km_metric(clusters, CMS, 'Completeness on K-Means')
    _plot_km_metric(clusters, SIL, 'Silhouette on K-Means')
Beispiel #19
0
def clusterEvaluation(trueY, fittedY):
    """Evaluate a fitted clustering against ground truth.

    Keys of the returned dict:
        NMI - normalized mutual information
        ARS - adjusted rand score
        HS  - homogeneity score (1 means perfect)
        CS  - completeness score
        VM  - v-measure score in [0, 1]; 1.0 is a perfectly complete labeling
    """
    scorers = (
        ('NMI', normalized_mutual_info_score),
        ('ARS', adjusted_rand_score),
        ('HS', homogeneity_score),
        ('CS', completeness_score),
        ('VM', v_measure_score),
    )
    return {key: scorer(trueY, fittedY) for key, scorer in scorers}
Beispiel #20
0
def print_five_measures(target, predicted):
    """Print five external clustering-agreement measures for the given
    ground-truth (target) and predicted label assignments.

    Fixes the "adjuted" typo in the last header line.
    """
    print('homogeneity score:')
    print(homogeneity_score(target, predicted))

    print('completeness score:')
    print(completeness_score(target, predicted))

    print('V-measure:')
    print(v_measure_score(target, predicted))

    print('adjusted rand score:')
    print(adjusted_rand_score(target, predicted))

    print('adjusted mutual info score:')
    print(adjusted_mutual_info_score(target, predicted))
Beispiel #21
0
def main():
    """Load image data, cluster it with Birch, and report accuracy,
    completeness and homogeneity plus the elapsed time."""
    start = timeit.default_timer()

    # Convert the image data to bitmaps and scale them.
    X, Y = load(scale = 50*100 )

    # Run Birch and assign labels.
    pred_label = process(X, labels_num = 32)
    # Compute accuracy against the true labels.
    accuracy(pred_label, Y, labels_num = 32)
    print("Completeness :", completeness_score(Y, pred_label) * 100, "%")
    print("Homogeneity :", homogeneity_score(Y, pred_label) * 100, "%")

    stop = timeit.default_timer()
    print("Time :", stop - start, "s")
Beispiel #22
0
def evaluate(clusters, typedict):
    """Given the predicted clusters and type dictionary, this function calculates homogeneity, completeness, and V-measure assuming the gold tags are the most frequent tags for each type in the type dict
    input:
        clusters (dict of int:Cluster): Clusters by id
        typedict (dict of str:Word): Word by wordform
    return:
        (float): homogeneity score
        (float): completeness score
        (float): V measure"""
    # The instructor completed this function in 7 line including the return
    golds = []
    preds = []
    # Your code here
    # NOTE(review): this is an exercise scaffold — golds/preds are left
    # empty, so the scores below run on empty lists until implemented.
    return homogeneity_score(golds, preds), completeness_score(
        golds, preds), v_measure_score(golds, preds, beta=2.0)
Beispiel #23
0
    def __computeKmeansMetrics(self, data, predictedLabels, gsLabels, title, basePath, phase4Results):
        """Score k-means predictions against gold-standard labels, append the
        scores to ``phase4Results`` as a new column named ``title``, and dump
        both confusion matrices to CSV files under ``basePath``.

        Returns the extended ``phase4Results`` DataFrame.
        """
        metrics = dict()
        metrics["davies_bouldin_score"] = clusteringMetrics.davies_bouldin_score(data, predictedLabels)
        metrics["adjusted_rand_score"] = clusteringMetrics.adjusted_rand_score(gsLabels, predictedLabels)
        metrics["completeness_score"] = clusteringMetrics.completeness_score(gsLabels, predictedLabels)
        metrics["purity_score"] = purity_score(gsLabels, predictedLabels)
        # Confusion matrix after mapping predicted cluster ids to gold labels,
        # plus the raw (unmapped) confusion matrix.
        confusionMatrixMapped = clusteringMappingMetric(predictedLabels, gsLabels)
        confusionMatrix = confusion_matrix(gsLabels, predictedLabels)

        kdf = pd.DataFrame.from_dict(metrics, orient='index', columns=[title])
        phase4Results = phase4Results.join(kdf)

        np.savetxt(basePath / f"{title}_kmeans_confusionMapping.csv", confusionMatrixMapped, delimiter=",", fmt='%i')
        np.savetxt(basePath / f"{title}_kmeans_confusion.csv", confusionMatrix, delimiter=",", fmt='%i')
        return phase4Results
Beispiel #24
0
def k_means_clustering(training_data,
                       target_labels,
                       title='Contingency Matrix',
                       n_clusters=20,
                       random_state=0,
                       max_iter=1000,
                       n_init=30):
    """Fit k-means on ``training_data``, plot the diagonal-maximized
    contingency matrix against ``target_labels``, print five external
    clustering scores, and return (results_dict, fitted_model)."""
    start = time.time()
    km = KMeans(n_clusters=n_clusters,
                random_state=random_state,
                max_iter=max_iter,
                n_init=n_init)
    km.fit(training_data)
    print("Finished clustering in %f seconds" % (time.time() - start))

    cm = contingency_matrix(target_labels, km.labels_)
    # reorder to maximize along diagonal
    rows, cols = linear_sum_assignment(cm, maximize=True)
    new_cm = cm[rows[:, np.newaxis], cols]

    print("Show Contingency Matrix:")
    plot_contingency_table_20(new_cm, title=title)

    print("Report 5 Measures for K-Means Clustering")

    homogeneity = homogeneity_score(target_labels, km.labels_)
    completeness = completeness_score(target_labels, km.labels_)
    v_measure = v_measure_score(target_labels, km.labels_)
    adjusted_rand_index = adjusted_rand_score(target_labels, km.labels_)
    adjusted_mutual_info = adjusted_mutual_info_score(target_labels,
                                                      km.labels_)

    print("Homogeneity Score: %f" % homogeneity)
    print("Completeness Score: %f" % completeness)
    print("V-Measure Score: %f" % v_measure)
    print("Adjusted Rand Index: %f" % adjusted_rand_index)
    print("Adjusted Mutual Information: %f" % adjusted_mutual_info)

    results = {
        "homogeneity": homogeneity,
        "completeness": completeness,
        "v_measure": v_measure,
        "adjusted_rand_index": adjusted_rand_index,
        "adjusted_mutual_info": adjusted_mutual_info
    }

    return results, km
def test():
    """Encode the graph with the (module-level) model, k-means-cluster the
    embeddings, and return link-prediction AUC/AP plus completeness,
    homogeneity and v-measure of the clusters against data.y."""
    model.eval()
    z = model.encode(data.x, data.train_pos_edge_index)

    # Cluster embedded values using k-means.
    kmeans_input = z.cpu().numpy()
    kmeans = KMeans(n_clusters=7, random_state=0).fit(kmeans_input)
    pred = kmeans.predict(kmeans_input)

    labels = data.y.cpu().numpy()
    completeness = completeness_score(labels, pred)
    hm = homogeneity_score(labels, pred)
    # v_measure_score is reported as "nmi" here — the harmonic mean of
    # homogeneity and completeness, not sklearn's NMI function.
    nmi = v_measure_score(labels, pred)

    auc, ap = model.test(z, data.test_pos_edge_index, data.test_neg_edge_index)

    return auc, ap, completeness, hm, nmi
 def evaluate(self):
     """Return a dict of seven external clustering scores comparing
     self.data['true_y'] against self.data['pred_y']."""
     true_y = self.data['true_y']
     pred_y = self.data['pred_y']
     scorers = (
         ('ami', adjusted_mutual_info_score),
         ('rand', adjusted_rand_score),
         ('comp', completeness_score),
         ('fow', fowlkes_mallows_score),
         ('hom', homogeneity_score),
         ('nmi', normalized_mutual_info_score),
         ('v_score', v_measure_score),
     )
     return {name: scorer(true_y, pred_y) for name, scorer in scorers}
Beispiel #27
0
    def compute_result(self, loss, preds, targets, stage):
        """K-means-cluster the predicted embeddings and log completeness,
        homogeneity and v-measure against the targets for the given stage.

        Returns a pl.EvalResult carrying the loss and the logged metrics.
        NOTE(review): pl.EvalResult was removed in later PyTorch Lightning
        releases — this presumably targets an older version; confirm.
        """
        # Cluster embedded values using k-means.
        kmeans_input = preds.cpu().numpy()
        kmeans = KMeans(n_clusters=7, random_state=0).fit(kmeans_input)
        pred = kmeans.predict(kmeans_input)

        labels = targets.cpu().numpy()
        # Wrap scalars in tensors so Lightning can log them.
        completeness = torch.Tensor([completeness_score(labels, pred)])
        hm = torch.Tensor([homogeneity_score(labels, pred)])
        nmi = torch.Tensor([v_measure_score(labels, pred)])

        # auc, ap = model.test(z, data.test_pos_edge_index, data.test_neg_edge_index)
        result = pl.EvalResult(loss)
        result.log(f"{stage}_completeness", completeness, prog_bar=True)
        result.log(f"{stage}_hm", hm, prog_bar=True)
        result.log(f"{stage}_nmi", nmi, prog_bar=True)
        return result
Beispiel #28
0
def compareAB(A, B):
    """Print several pairwise agreement scores for the two label
    assignments A and B (permutation-invariant; ARI is also
    chance-normalized)."""
    named_scores = (
        # Similarity ignoring permutations, with chance normalization.
        ("adjusted_rand_score", metrics.adjusted_rand_score(A, B)),
        # Each cluster contains only members of a single class.
        ("homogeneity_score", homogeneity_score(A, B)),
        # All members of a given class are assigned to the same cluster.
        ("completeness_score", completeness_score(A, B)),
        ("v_measure_score", metrics.v_measure_score(A, B)),
        # Geometric mean of the pairwise precision and recall.
        ("fowlkes_mallows_score", metrics.fowlkes_mallows_score(A, B)),
    )
    for name, value in named_scores:
        print(name + " " + str(value))
Beispiel #29
0
def main():
    """Load train/test image data, cluster with k-means, and report
    accuracy, completeness and homogeneity plus the elapsed time."""
    start = timeit.default_timer()

    # Convert the image data to bitmaps and scale them.
    Test_X, Test_Y, Train_X, Train_Y = load(scale=50 * 100)

    print(len(Test_X))
    print(len(Test_Y))
    print(len(Train_X))
    print(len(Train_Y))
    # Run K-means and assign labels.
    pred_label = process(Train_X, Train_Y, Test_X)
    # Compute accuracy against the true labels.
    accuracy(pred_label, Test_Y, labels_num=32)
    print("Completeness :", completeness_score(Test_Y, pred_label) * 100, "%")
    print("Homogeneity :", homogeneity_score(Test_Y, pred_label) * 100, "%")

    stop = timeit.default_timer()
    print("Time :", stop - start, "s")
Beispiel #30
0
 def computeMetrics(self, data, trueLabels, predictedLabels):
     """Score every algorithm's predicted labels against the true labels.

     Args:
         data: feature matrix used for the internal Davies-Bouldin score.
         trueLabels: ground-truth labels.
         predictedLabels: dict mapping algorithm name -> predicted labels.

     Returns:
         (metrics, confusionMatrixes): per-algorithm score dicts and
         mapped confusion matrices.
     """
     confusionMatrixes = dict()
     metrics = dict()
     for algorithmName, labels in predictedLabels.items():
         metrics[algorithmName] = dict()
         metrics[algorithmName][
             "davies_bouldin_score"] = clusteringMetrics.davies_bouldin_score(
                 data, labels)
         metrics[algorithmName][
             "adjusted_rand_score"] = clusteringMetrics.adjusted_rand_score(
                 trueLabels, labels)
         metrics[algorithmName][
             "completeness_score"] = clusteringMetrics.completeness_score(
                 trueLabels, labels)
         metrics[algorithmName]["purity_score"] = purity_score(
             trueLabels, labels)
         confusionMatrixes[algorithmName] = clusteringMappingMetric(
             labels, trueLabels)
     return metrics, confusionMatrixes
Beispiel #31
0
def dbscanLibrarySklearn():
    """Run sklearn DBSCAN on the training data, print cluster/noise counts
    and homogeneity/completeness, then scatter-plot the clusters (noise in
    black)."""
    X_train, X_test, labels_true, y_test = readData()

    # Compute DBSCAN
    db = DBSCAN(eps=0.6, min_samples=4).fit(X_train)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    print("Homogeneity: %0.3f" % homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % completeness_score(labels_true, labels))

    # Plot result
    import matplotlib.pyplot as plt

    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = [plt.cm.Spectral(each)
            for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]

        class_member_mask = (labels == k)

        # NOTE(review): core and non-core samples are plotted with the same
        # markersize (5), so the two plot calls are visually identical —
        # the upstream sklearn example distinguishes them; confirm intent.
        xy = X_train[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                markeredgecolor='k', markersize=5)

        xy = X_train[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                markeredgecolor='k', markersize=5)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
def plot_kmeans_em_v_meansrue(n_clusters):
    """Plot, per dataset, a hand-computed V-measure between the k-means and
    EM labelings stored in the module-level clustering matrices for the
    given cluster count."""
    # Matrices are indexed by (n_clusters - 5) rows.
    row_idx = n_clusters - 5
    v_measure_array = []
    xvalues = all_em_clustering_np_matrix.keys()
    for key in xvalues:
        kmeans_label = all_kmeans_clustering_np_matrix[key][row_idx]
        em_label = all_em_clustering_np_matrix[key][row_idx]
        c = completeness_score(kmeans_label, em_label)
        h = homogeneity_score(kmeans_label, em_label)
        # Harmonic mean of completeness and homogeneity (the V-measure).
        # NOTE(review): divides by zero if c + h == 0; confirm inputs.
        v_measure = 2 * c * h / (c + h)
        v_measure_array.append(v_measure)
    plt.figure(figsize=(8, 8))
    plt.plot(range(len(v_measure_array)), v_measure_array, "o-")
    plt.xticks(np.arange(len(v_measure_array)), xvalues)
    plt.yticks(np.linspace(0.6, 1.0, 9))
    plt.grid(True)
    plt.xlabel("\nDatasets",fontsize=14)
    plt.ylabel("V-measure score",fontsize=14)
    plt.title("Comparison of clustering consistency \nbetween EM and Kmeans on the same dataset",fontsize=14)
    plt.show()
# Quick sanity checks: both scores are invariant to a permutation of the
# predicted label values (first call of each pair scores 1.0); the second
# call of each pair uses an imperfect labeling.
from sklearn.metrics.cluster import homogeneity_score
print(homogeneity_score([0, 0, 1, 1], [1, 1, 0, 0]))
print(homogeneity_score([0, 0, 0, 1, 1, 1], [3, 2, 2, 2, 3, 3]))

from sklearn.metrics.cluster import completeness_score
print(completeness_score([0, 0, 1, 1], [1, 1, 0, 0]))
print(completeness_score([0, 0, 0, 1, 1, 1], [3, 2, 2, 2, 3, 3]))
#X_iso = manifold.Isomap(n_neighbors=5, n_components=2).fit_transform(X_train)
#end = int(round(time.time() * 1000))
#print("--Isomap finished in ", (end-start), "ms--------------")
#print("Done.")


#spectral clustering, fitting and predictions
spectral = cluster.SpectralClustering(n_clusters=10, eigen_solver='arpack', affinity="nearest_neighbors")

#X = spectral.fit(X_iso)
X = spectral.fit(X_spec)

#y_pred = spectral.fit_predict(X_iso)
y_pred = spectral.fit_predict(X_spec)


# clustering evaluation metrics
print(confusion_matrix(y_train, y_pred))
print (completeness_score(y_train, y_pred))


# Scatter plot of the spectral embedding colored by predicted cluster.
with plt.style.context('fivethirtyeight'):
	plt.title("Spectral embedding & spectral clustering on MNIST")
	plt.scatter(X_spec[:, 0], X_spec[:, 1], c=y_pred, s=50, cmap=plt.cm.get_cmap("jet", 10))
	plt.colorbar(ticks=range(10))
	plt.clim(-0.5, 9.5)
plt.show()


 def check_clusters(self):
     # Dump clustering diagnostics (Python-2 print statements): the raw
     # color labels, cluster count, precomputed silhouette, and
     # homogeneity/completeness of self.labels against self.colors.
     print self.colors
     print 'number of clusters is ' + str(self.clusters)
     print 'silhouette score is ' + str(self.sil)
     print 'homogeneity is ' + str(homogeneity_score(self.colors, self.labels))
     print 'completeness is ' + str(completeness_score(self.colors, self.labels))
def evaluate(colors, labels):
    """Log homogeneity and completeness of ``labels`` against the
    ground-truth ``colors``.

    Bug fix: the scores are floats in [0, 1], but the original used '%d',
    which truncated them to 0 (or 1) in the log output.  Uses '%f' and
    logging's lazy argument passing instead.
    """
    b = homogeneity_score(colors, labels)
    c = completeness_score(colors, labels)
    logging.debug('homogeneity is %f', b)
    logging.debug('completeness is %f', c)
def evaluate(colors, labels):
    # Print homogeneity and completeness of `labels` against the
    # ground-truth `colors` (Python-2 print statements).
    # NOTE(review): redefines the `evaluate` name declared just above in
    # this file; only the later definition survives at import time.
    b = homogeneity_score(colors, labels)
    c = completeness_score(colors, labels)
    print 'homogeneity is ' + str(b)
    print 'completeness is ' + str(c)