def test_pair_confusion_matrix_single_cluster():
    """Edge case: all samples share one cluster, so every ordered pair is a
    same/same agreement — all N*(N-1) counts land in the [1, 1] cell."""
    n_samples = 100
    labels = np.zeros((n_samples,))
    want = np.array([[0, 0], [0, n_samples * (n_samples - 1)]])
    # the same labeling is passed on both sides, mirroring the original alias
    assert_array_equal(pair_confusion_matrix(labels, labels), want)
def test_pair_confusion_matrix_fully_dispersed():
    """Edge case: every sample is its own cluster, so every ordered pair is a
    different/different agreement — all N*(N-1) counts land in the [0, 0] cell."""
    n_samples = 100
    labels = list(range(n_samples))
    want = np.array([[n_samples * (n_samples - 1), 0], [0, 0]])
    # the same labeling is passed on both sides, mirroring the original alias
    assert_array_equal(pair_confusion_matrix(labels, labels), want)
def print_acc(label_true, label_pred):
    """Print the pair confusion matrix, the fraction of pair-level agreements
    (its diagonal mass), and the project's `acc_score` for the two labelings."""
    pcm = pair_confusion_matrix(label_true, label_pred)
    print(pcm)
    # diagonal cells count pairs on which both labelings agree
    agreement = (pcm[0][0] + pcm[1][1]) / sum(sum(pcm))
    print(agreement)
    # acc_score expects array inputs, so convert before the call
    print(acc_score(np.array(label_true), np.array(label_pred)))
def get_rand_index_and_f_measure(labels_true, labels_pred, beta=1.):
    """Compute the Rand index, adjusted Rand index and F-beta measure from
    the pair confusion matrix of two labelings.

    Parameters: labels_true / labels_pred are the two cluster label vectors;
    beta weights recall vs. precision in the F-measure (default 1.0).
    Returns a tuple ``(ri, ari, f_beta)`` of floats.
    """
    (tn, fp), (fn, tp) = pair_confusion_matrix(labels_true, labels_pred)
    # cast to Python ints: the np.int64 products below can overflow for
    # large sample counts (counts grow like N**2, products like N**4)
    tn, fp, fn, tp = int(tn), int(fp), int(fn), int(tp)
    ri = (tp + tn) / (tp + tn + fp + fn)
    if fn == 0 and fp == 0:
        # degenerate case: the labelings agree on every pair; the ARI and
        # F-beta denominators are 0 here, and both scores are 1 by convention
        return ri, 1.0, 1.0
    ari = 2. * (tp * tn - fn * fp) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn))
    if tp == 0:
        # no true-positive pairs at all: precision and recall are both 0,
        # which would make the F-beta denominator 0 — define F-beta as 0
        return ri, ari, 0.0
    p, r = tp / (tp + fp), tp / (tp + fn)
    f_beta = (1 + beta**2) * (p * r / ((beta**2) * p + r))
    return ri, ari, f_beta
def test_pair_confusion_matrix():
    """Regular case: check pair_confusion_matrix on two different non-trivial
    clusterings of N = n**2 points against a naive O(N^2) reference count."""
    n = 10
    num_points = n ** 2
    # n blocks of size n vs. n blocks of size n+1 truncated to N points
    labels_a = np.hstack([[block + 1] * n for block in range(n)])
    labels_b = np.hstack([[block + 1] * (n + 1) for block in range(n)])[:num_points]
    # brute-force reference: classify every ordered pair (i, j), i != j
    reference = np.zeros(shape=(2, 2), dtype=np.int64)
    for i, (a_i, b_i) in enumerate(zip(labels_a, labels_b)):
        for j, (a_j, b_j) in enumerate(zip(labels_a, labels_b)):
            if i == j:
                continue
            reference[int(a_i == a_j), int(b_i == b_j)] += 1
    assert_array_equal(pair_confusion_matrix(labels_a, labels_b), reference)
def gera_estatisticas(dataset, algo, estastisticas_gerais):
    """Compare the online pair-counting Rand index against scikit-learn's.

    Reads the ground-truth and predicted label CSVs for ``dataset``/``algo``,
    streams the labels one sample at a time through ``ContagemParesOnline``,
    writes the running Rand-index series and its running mean / population
    std-dev to CSVs under ``resultados/``, prints both implementations'
    results, and appends a summary row to ``estastisticas_gerais`` (the list
    is mutated in place).
    """
    caminho_resultados = "dados/mock/" + dataset + "/"
    path_verdade = caminho_resultados + dataset + "GT.csv"
    path_particao = caminho_resultados + algo + "_" + dataset + "/" + algo + "_" + dataset + "_labels.csv"
    contagem_de_pares = ContagemParesOnline(dataset + '_labels_verdadeiro', algo)
    labels_verdadeiro = readCsv(path_verdade, None).to_numpy()
    labels_particao = readCsv(path_particao, None).to_numpy()
    # ground truth is shifted to 0-based labels; NOTE(review): the predicted
    # labels are not shifted — presumably already 0-based, confirm upstream.
    labels_verdadeiro = labels_verdadeiro.flatten() - 1
    labels_particao = labels_particao.flatten()
    # reference values from scikit-learn, computed on the full label vectors
    sk_rand_score = rand_score(labels_verdadeiro, labels_particao)
    sk_pair_confusion_matrix = pair_confusion_matrix(labels_verdadeiro, labels_particao)
    sk_contigencia = contingency_matrix(labels_verdadeiro, labels_particao)
    rand_index = []
    df = []
    # feed the online counter one (true, predicted) pair per step, recording
    # the running Rand index plus its running mean and population std-dev
    for index in range(len(labels_verdadeiro)):
        contagem_de_pares.atualiza(labels_verdadeiro[index], labels_particao[index])
        rand_index.append(contagem_de_pares.rand_index)
        media = sta.mean(rand_index)
        desvio_padrao = sta.pstdev(rand_index)
        dados = [contagem_de_pares.rand_index, media, desvio_padrao]
        df.append(dados)
    dados_rand_index = pd.DataFrame(list(map(np.ravel, rand_index)))
    dados_estatististicos = pd.DataFrame(list(map(np.ravel, df)))
    caminho = "resultados/" + dataset + "/" + algo + "_" + dataset + "/"
    # create the output directory tree if it does not exist yet
    os.makedirs(os.path.dirname(caminho), exist_ok=True)
    nomeArquivo = algo + "_" + dataset + "_ri"
    dados_rand_index.to_csv(caminho + nomeArquivo + '.csv', index=False, header=False)
    nomeArquivo_dados = algo + "_" + dataset + "_estastisticas"
    dados_estatististicos.to_csv(caminho + nomeArquivo_dados + '.csv', index=False, header=['rand_index', 'media', 'desvio_padrao'])
    # side-by-side report: scikit-learn values first, then the online counter's
    print("RI Python {} {}: {}".format(dataset, algo, sk_rand_score))
    print("N's Python {} {}: \n{}".format(dataset, algo, sk_pair_confusion_matrix))
    print("Contigencia's Python {} {}: \n{}".format(dataset, algo, sk_contigencia))
    print("RI {} {}: {}".format(dataset, algo, contagem_de_pares.rand_index))
    print("N's {} {}: \n{}".format(dataset, algo, contagem_de_pares.matriz_confusao_pares))
    print("Contigencia's {} {}: \n{}".format(dataset, algo, contagem_de_pares.matriz_confusao))
    print('Média: {}'.format(media))
    print('Desvio Padrao: {}'.format(desvio_padrao))
    estastisticas_gerais.append([dataset, algo, contagem_de_pares.rand_index, media, desvio_padrao])
dist = self.distance(self.centroids, x) return np.argmin(dist, axis=1) + 1 clf = KMeansClassifier(clusters=3) clf.fit(X.values) predicted = clf.predict(X.values) remap = {3: 3, 2: 1, 1: 2} predicted = np.array(list(map(lambda x: remap[x], predicted))) predicted, y.values.reshape(X.values.shape[0]) draw_clusters(Xr, predicted.reshape(y.values.shape), ['red', 'pink', 'blue']) cm = pair_confusion_matrix(y.values.T[0], predicted) def AdjRand(cm): (TN, FP), (FN, TP) = cm # print(FP, FN, TP, TN, TP + FN, TP + FN + FP + TN) return 2. * (TP * TN - FN * FP) / ((TP + FN) * (FN + TN) + (TP + FP) * (FP + TN)) # return (TP + FN) / (TP + FN + FP + TN) AdjRand(cm) def inter_cluster_distances(labels, distances):