Ejemplo n.º 1
0
def test_non_consecutive_labels():
    # regression tests for labels with gaps
    expected_hcv = (0.67, 0.42, 0.52)
    gap_cases = (
        ([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2]),  # gap in the true labels
        ([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]),  # gap in the predicted labels
    )
    for labels_true, labels_pred in gap_cases:
        scores = homogeneity_completeness_v_measure(labels_true, labels_pred)
        for score, expected in zip(scores, expected_hcv):
            assert_almost_equal(score, expected, 2)

    # ARI and RI must be invariant to the actual label values used
    labels_true = [0, 0, 0, 1, 1, 1]
    assert_almost_equal(
        adjusted_rand_score(labels_true, [0, 1, 0, 1, 2, 2]), 0.24, 2)
    assert_almost_equal(
        adjusted_rand_score(labels_true, [0, 4, 0, 4, 2, 2]), 0.24, 2)

    assert_almost_equal(rand_score(labels_true, [0, 1, 0, 1, 2, 2]), 0.66, 2)
    assert_almost_equal(rand_score(labels_true, [0, 4, 0, 4, 2, 2]), 0.66, 2)
Ejemplo n.º 2
0
def test_rand_score():
    # regular case: different non-trivial clusterings
    labels_a = [0, 0, 0, 1, 1, 1]
    labels_b = [0, 1, 0, 1, 2, 2]
    # entries of the pair confusion matrix, as counts of ordered pairs
    n_same_same = 2 * 2  # ordered pairs (1, 3), (5, 6)
    n_same_diff = 2 * 4  # ordered pairs (1, 2), (2, 3), (4, 5), (4, 6)
    n_diff_same = 2 * 1  # ordered pair (2, 4)
    # all 6*5 ordered pairs minus the three categories above
    n_diff_diff = 5 * 6 - n_same_same - n_diff_same - n_same_diff
    # Rand score = agreeing pairs / all pairs
    agreements = n_diff_diff + n_same_same
    total_pairs = n_diff_diff + n_diff_same + n_same_diff + n_same_same
    assert_allclose(rand_score(labels_a, labels_b), agreements / total_pairs)
Ejemplo n.º 3
0
def calculate_class_ari_ri(
    ground_truth_df,
    prediction_df,
    time_stamp,
    class_list=None,
):
    """Calculate the Adjusted Rand Index (ARI) and Rand Index (RI) of CAZy class predictions.

    :param ground_truth_df: df of CAZy annotations of proteins
    :param prediction_df: df of predicted CAZy family annotations from all prediction tools
    :param time_stamp: str (currently unused; kept for interface compatibility)
    :param class_list: list of CAZy class names; defaults to the six standard classes

    Return prediction_df with the calc ARI and RI added in as new columns (ARI and RI calc
    per protein, and each protein is a separate row).
    """
    # Avoid a mutable default argument; build the default list per call.
    if class_list is None:
        class_list = ["GH", "GT", "PL", "CE", "AA", "CBM"]

    ri_scores = []
    ari_scores = []

    # Both dataframes are assumed row-aligned: row i of prediction_df describes
    # the same protein as row i of ground_truth_df — TODO confirm with callers.
    for row_index in tqdm(
            range(len(prediction_df)),
            desc="Calculating CAZy class ARI and RI",
    ):
        ground_truths_row = ground_truth_df.iloc[row_index]
        predictions_row = prediction_df.iloc[row_index]

        # Restrict the comparison to the CAZy class columns only.
        y_true = ground_truths_row[class_list]
        y_pred = predictions_row[class_list]

        ri_scores.append(rand_score(y_true, y_pred))
        ari_scores.append(adjusted_rand_score(y_true, y_pred))

    prediction_df["Rand_index"] = ri_scores
    prediction_df["Adjusted_Rand_index"] = ari_scores

    return prediction_df
def gera_estatisticas(dataset, algo, estastisticas_gerais):
    """Compute online Rand-index statistics for one dataset/algorithm pair.

    Reads ground-truth and predicted partition labels from CSV files, feeds them
    pair-by-pair into an online pair-counting structure, writes the running Rand
    index and its running mean/population-std-dev to CSV files, prints a
    comparison against scikit-learn's batch results, and appends a summary row
    to ``estastisticas_gerais`` (mutated in place; nothing is returned).

    :param dataset: dataset name, used to build input/output file paths
    :param algo: algorithm name, used to build input/output file paths
    :param estastisticas_gerais: list accumulating [dataset, algo, ri, mean, std] rows
    """
    caminho_resultados = "dados/mock/" + dataset + "/"
    # Input paths: ground-truth labels and the algorithm's predicted labels.
    path_verdade = caminho_resultados + dataset + "GT.csv"
    path_particao =  caminho_resultados + algo + "_" + dataset + "/" + algo + "_" + dataset + "_labels.csv"
    contagem_de_pares = ContagemParesOnline(dataset + '_labels_verdadeiro', algo)
    labels_verdadeiro = readCsv(path_verdade, None).to_numpy()
    labels_particao = readCsv(path_particao, None).to_numpy()
    # Ground-truth labels are shifted by -1 — presumably 1-based in the CSV;
    # verify against the data files.
    labels_verdadeiro = labels_verdadeiro.flatten() - 1
    labels_particao = labels_particao.flatten()
    # Batch reference values from scikit-learn, printed below for comparison
    # with the online implementation.
    sk_rand_score = rand_score(labels_verdadeiro, labels_particao)
    sk_pair_confusion_matrix = pair_confusion_matrix(labels_verdadeiro, labels_particao)
    sk_contigencia = contingency_matrix(labels_verdadeiro, labels_particao)
    rand_index = []
    df = []
    for index in range(len(labels_verdadeiro)):
        # Feed one (truth, prediction) pair and record the running Rand index.
        contagem_de_pares.atualiza(labels_verdadeiro[index], labels_particao[index])
        rand_index.append(contagem_de_pares.rand_index)
        # NOTE(review): mean/pstdev recomputed over the full history each step,
        # O(n^2) overall — appears intentional to log per-step statistics.
        media = sta.mean(rand_index)
        desvio_padrao = sta.pstdev(rand_index)
        dados = [contagem_de_pares.rand_index, media, desvio_padrao]
        df.append(dados)
    # Persist the per-step Rand index and the per-step [ri, mean, std] rows.
    dados_rand_index = pd.DataFrame(list(map(np.ravel, rand_index)))
    dados_estatististicos = pd.DataFrame(list(map(np.ravel, df)))
    caminho = "resultados/" + dataset + "/" + algo + "_" + dataset + "/"
    os.makedirs(os.path.dirname(caminho), exist_ok=True)
    nomeArquivo =  algo + "_" + dataset + "_ri"
    dados_rand_index.to_csv(caminho + nomeArquivo + '.csv', index=False, header=False)
    nomeArquivo_dados = algo + "_" + dataset + "_estastisticas"
    dados_estatististicos.to_csv(caminho + nomeArquivo_dados + '.csv', index=False, header=['rand_index', 'media', 'desvio_padrao'])
    print("RI Python {} {}: {}".format(dataset, algo, sk_rand_score))
    print("N's Python {} {}: \n{}".format(dataset, algo, sk_pair_confusion_matrix))
    print("Contigencia's Python {} {}: \n{}".format(dataset, algo, sk_contigencia))
    print("RI {} {}: {}".format(dataset, algo, contagem_de_pares.rand_index))
    print("N's {} {}: \n{}".format(dataset, algo, contagem_de_pares.matriz_confusao_pares))
    print("Contigencia's {} {}: \n{}".format(dataset, algo, contagem_de_pares.matriz_confusao))
    # NOTE(review): `media`/`desvio_padrao` are loop variables — the prints
    # below raise NameError if the label arrays are empty.
    print('Média: {}'.format(media))
    print('Desvio Padrao: {}'.format(desvio_padrao))
    estastisticas_gerais.append([dataset, algo, contagem_de_pares.rand_index, media, desvio_padrao])
Ejemplo n.º 5
0
        answer_files = glob(output_dir + '/*.csv')

        for answer_file in answer_files:
            answer_df = pd.read_csv(answer_file)
            last_column = answer_df.columns[-1]
            answer_df = answer_df.sort_values(by=[last_column])

            filename = os.path.splitext(os.path.basename(answer_file))[0]
            filename = filename + ".csv"

            y_pred = answer_df[answer_df.columns[-1]].to_numpy()

            try:
                purity = purity_score(y_true, y_pred)
                nmi = normalized_mutual_info_score(y_true, y_pred)
                rand = rand_score(y_true, y_pred)

                avg = (purity + nmi + rand) / 3

                rank_dict['name'].append(name)
                rank_dict['filename'].append(filename)
                rank_dict['purity'].append(purity)
                rank_dict['nmi'].append(nmi)
                rank_dict['rand_index'].append(rand)
                rank_dict['score'].append(avg)
                rank_dict['note'].append('')
            except:
                rank_dict['name'].append(name)
                rank_dict['filename'].append(filename)
                rank_dict['purity'].append(0)
                rank_dict['nmi'].append(0)
def test_rand_score_edge_cases(clustering1, clustering2):
    """Degenerate labellings must yield a perfect Rand score."""
    # edge case 1: every element is its own cluster
    # edge case 2: only one cluster
    score = rand_score(clustering1, clustering2)
    assert_allclose(score, 1.)
Ejemplo n.º 7
0
# From speed - it is clear that scipy implementation is a lot faster than our own, probably because they implement it in C lang
# 
# From quality - both implementations perform equally

# # Task 4
# 4.1: Plot clustering performance against percent of completed iterations
# 
# 4.2: Split data 90/10 and repeat 4.1
# 
# I'm using rand_score from sklearn because it is essentially a percent of points assigned to a correct cluster

# 4.1
# Run k-means with an increasing iteration cap (0..10) and score each run's
# assignment against the known clusters; the list indexes performance by cap.
performance = []
for i in range(0, 11):
    # NOTE(review): tolerance=10e-5 equals 1e-4 — confirm 1e-5 was not intended.
    assignment = kmeans_cluster_assignment(3, hard_points, tolerance=10e-5, max_iterations=i)
    score = rand_score(hard_clusters, assignment)
    performance.append(score)

plt.plot(performance)
plt.savefig("./4.1-plot.png")
# 4.2
from sklearn.model_selection import train_test_split
x_train, x_test, _, y_test = train_test_split(hard_points, hard_clusters, test_size=0.1, random_state=42)
performance_2 = []
for i in range(11):
    assignment = kmeans_cluster_assignment(3, x_train, max_iterations=i, tolerance=10e-5)
    centroids = []
    for j in [0, 1, 2]:
        cluster_points = x_train[np.argwhere(assignment == j).ravel()]
        centroids.append((