def test_non_consecutive_labels():
    """Regression tests: clustering metrics are unchanged by gaps in label ids."""
    # Homogeneity / completeness / v-measure with gapped true labels ...
    expected_hcv = (0.67, 0.42, 0.52)
    scores = homogeneity_completeness_v_measure([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2])
    for got, want in zip(scores, expected_hcv):
        assert_almost_equal(got, want, 2)
    # ... and with gapped predicted labels.
    scores = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2])
    for got, want in zip(scores, expected_hcv):
        assert_almost_equal(got, want, 2)
    # ARI must be identical whether predicted ids are consecutive or gapped.
    labels_true = [0, 0, 0, 1, 1, 1]
    gapless_pred = [0, 1, 0, 1, 2, 2]
    gapped_pred = [0, 4, 0, 4, 2, 2]
    for labels_pred in (gapless_pred, gapped_pred):
        assert_almost_equal(adjusted_rand_score(labels_true, labels_pred), 0.24, 2)
    # Same invariance for the plain Rand index.
    for labels_pred in (gapless_pred, gapped_pred):
        assert_almost_equal(rand_score(labels_true, labels_pred), 0.66, 2)
def test_rand_score():
    """Check rand_score on two different non-trivial clusterings against a hand-derived value."""
    labels_a = [0, 0, 0, 1, 1, 1]
    labels_b = [0, 1, 0, 1, 2, 2]
    # Pair confusion matrix entries, counted over ordered pairs of samples.
    same_same = 2 * 2  # ordered pairs (1, 3), (5, 6)
    same_diff = 2 * 4  # ordered pairs (1, 2), (2, 3), (4, 5), (4, 6)
    diff_same = 2 * 1  # ordered pair (2, 4)
    diff_diff = 5 * 6 - same_same - diff_same - same_diff  # all remaining pairs
    # Rand index = agreeing pairs / all pairs.
    numerator = diff_diff + same_same
    denominator = diff_diff + diff_same + same_diff + same_same
    expected = numerator / denominator
    assert_allclose(rand_score(labels_a, labels_b), expected)
def calculate_class_ari_ri(
    ground_truth_df,
    prediction_df,
    time_stamp,
    class_list=None,
):
    """Calculate the Adjusted Rand Index (ARI) and Rand Index (RI) of CAZy class predictions.

    :param ground_truth_df: df of CAZy annotations of proteins
    :param prediction_df: df of predicted CAZy family annotations from all prediction tools
    :param time_stamp: str (kept for interface compatibility; not used here)
    :param class_list: list of CAZy class names; defaults to the six main classes

    Return prediction_df with the calc ARI and RI added in as new columns
    (ARI and RI calc per protein, and each protein is a separate row).
    """
    # Avoid the mutable-default-argument pitfall: build the default per call.
    if class_list is None:
        class_list = ["GH", "GT", "PL", "CE", "AA", "CBM"]

    ri_scores = []
    ari_scores = []

    # Rows at the same positional index in both dfs describe the same protein.
    for row_index in tqdm(
        range(len(prediction_df["Prediction_tool"])),
        desc="Calculating CAZy class ARI and RI",
    ):
        y_true = ground_truth_df.iloc[row_index][class_list]
        y_pred = prediction_df.iloc[row_index][class_list]

        ri_scores.append(rand_score(y_true, y_pred))
        ari_scores.append(adjusted_rand_score(y_true, y_pred))

    prediction_df["Rand_index"] = ri_scores
    prediction_df["Adjusted_Rand_index"] = ari_scores

    return prediction_df
def gera_estatisticas(dataset, algo, estastisticas_gerais):
    """Compute incremental Rand-index statistics for one dataset/algorithm pair.

    Reads the ground-truth and predicted partition CSVs, feeds the label pairs
    one at a time into the online pair-counting implementation
    (ContagemParesOnline), writes the per-step Rand index and running summary
    statistics to CSV, prints a comparison against scikit-learn's reference
    values, and appends a summary row to ``estastisticas_gerais``.

    NOTE(review): reconstructed from whitespace-collapsed source; the loop-body
    extent (running mean/std computed per step) is inferred — confirm against
    the original file.
    """
    caminho_resultados = "dados/mock/" + dataset + "/"
    path_verdade = caminho_resultados + dataset + "GT.csv"
    path_particao = caminho_resultados + algo + "_" + dataset + "/" + algo + "_" + dataset + "_labels.csv"

    # Online (incremental) pair-counting Rand index implementation under test.
    contagem_de_pares = ContagemParesOnline(dataset + '_labels_verdadeiro', algo)

    labels_verdadeiro = readCsv(path_verdade, None).to_numpy()
    labels_particao = readCsv(path_particao, None).to_numpy()
    # Ground-truth labels are shifted by -1 (presumably 1-based in the CSV —
    # TODO confirm); predicted labels are used as-is.
    labels_verdadeiro = labels_verdadeiro.flatten() - 1
    labels_particao = labels_particao.flatten()

    # Reference values from scikit-learn, for comparison with the online version.
    sk_rand_score = rand_score(labels_verdadeiro, labels_particao)
    sk_pair_confusion_matrix = pair_confusion_matrix(labels_verdadeiro, labels_particao)
    sk_contigencia = contingency_matrix(labels_verdadeiro, labels_particao)

    rand_index = []
    df = []
    # Feed one (truth, prediction) pair at a time and record the running Rand
    # index plus its running mean and population standard deviation.
    for index in range(len(labels_verdadeiro)):
        contagem_de_pares.atualiza(labels_verdadeiro[index], labels_particao[index])
        rand_index.append(contagem_de_pares.rand_index)
        media = sta.mean(rand_index)
        desvio_padrao = sta.pstdev(rand_index)
        dados = [contagem_de_pares.rand_index, media, desvio_padrao]
        df.append(dados)

    dados_rand_index = pd.DataFrame(list(map(np.ravel, rand_index)))
    dados_estatististicos = pd.DataFrame(list(map(np.ravel, df)))

    # Persist per-step results under resultados/<dataset>/<algo>_<dataset>/.
    caminho = "resultados/" + dataset + "/" + algo + "_" + dataset + "/"
    os.makedirs(os.path.dirname(caminho), exist_ok=True)
    nomeArquivo = algo + "_" + dataset + "_ri"
    dados_rand_index.to_csv(caminho + nomeArquivo + '.csv', index=False, header=False)
    nomeArquivo_dados = algo + "_" + dataset + "_estastisticas"
    dados_estatististicos.to_csv(caminho + nomeArquivo_dados + '.csv', index=False, header=['rand_index', 'media', 'desvio_padrao'])

    # Side-by-side comparison: scikit-learn values first, then the online ones.
    print("RI Python {} {}: {}".format(dataset, algo, sk_rand_score))
    print("N's Python {} {}: \n{}".format(dataset, algo, sk_pair_confusion_matrix))
    print("Contigencia's Python {} {}: \n{}".format(dataset, algo, sk_contigencia))
    print("RI {} {}: {}".format(dataset, algo, contagem_de_pares.rand_index))
    print("N's {} {}: \n{}".format(dataset, algo, contagem_de_pares.matriz_confusao_pares))
    print("Contigencia's {} {}: \n{}".format(dataset, algo, contagem_de_pares.matriz_confusao))
    print('Média: {}'.format(media))
    print('Desvio Padrao: {}'.format(desvio_padrao))

    estastisticas_gerais.append([dataset, algo, contagem_de_pares.rand_index, media, desvio_padrao])
# Top-level script fragment: score every submission CSV in output_dir.
# NOTE(review): `output_dir`, `name`, `y_true`, `rank_dict`, and `purity_score`
# are defined elsewhere in the file — not visible in this chunk.
answer_files = glob(output_dir + '/*.csv')
for answer_file in answer_files:
    answer_df = pd.read_csv(answer_file)
    # Sort rows by the final column (presumably the predicted labels — TODO confirm).
    last_column = answer_df.columns[-1]
    answer_df = answer_df.sort_values(by=[last_column])
    filename = os.path.splitext(os.path.basename(answer_file))[0]
    filename = filename + ".csv"
    y_pred = answer_df[answer_df.columns[-1]].to_numpy()
    try:
        purity = purity_score(y_true, y_pred)
        nmi = normalized_mutual_info_score(y_true, y_pred)
        rand = rand_score(y_true, y_pred)
        # Overall score is the unweighted mean of the three metrics.
        avg = (purity + nmi + rand) / 3
        rank_dict['name'].append(name)
        rank_dict['filename'].append(filename)
        rank_dict['purity'].append(purity)
        rank_dict['nmi'].append(nmi)
        rank_dict['rand_index'].append(rand)
        rank_dict['score'].append(avg)
        rank_dict['note'].append('')
    # NOTE(review): bare except silently swallows every error (including typos);
    # failed entries are scored 0 instead of being reported.
    except:
        rank_dict['name'].append(name)
        rank_dict['filename'].append(filename)
        rank_dict['purity'].append(0)
        rank_dict['nmi'].append(0)
        # NOTE(review): this branch stops before appending to 'rand_index',
        # 'score' and 'note', which would leave rank_dict columns with unequal
        # lengths — confirm the remaining appends aren't cut off in this chunk.
def test_rand_score_edge_cases(clustering1, clustering2):
    """Degenerate clusterings (all singletons, or a single cluster) yield RI == 1."""
    # edge case 1: every element is its own cluster
    # edge case 2: only one cluster
    score = rand_score(clustering1, clustering2)
    assert_allclose(score, 1.)
# From speed - it is clear that scipy implementation is a lot faster than our own, probably because they implement it in C lang # # From quality - both implementations performs equally # # Task 4 # 4.1: Plot clustering performance against percant of completed iterations # # 4.2: Split data 90/10 and repeat 4.1 # # I'm using rand_score from sklearn because it is essentially a percent of points assigned to a correct cluster # 4.1 performance = [] for i in range(0, 11): assignment = kmeans_cluster_assignment(3, hard_points, tolerance=10e-5, max_iterations=i) score = rand_score(hard_clusters, assignment) performance.append(score) plt.plot(performance) plt.savefig("./4.1-plot.png") # 4.2 from sklearn.model_selection import train_test_split x_train, x_test, _, y_test = train_test_split(hard_points, hard_clusters, test_size=0.1, random_state=42) performance_2 = [] for i in range(11): assignment = kmeans_cluster_assignment(3, x_train, max_iterations=i, tolerance=10e-5) centroids = [] for j in [0, 1, 2]: cluster_points = x_train[np.argwhere(assignment == j).ravel()] centroids.append((