if final_k_results['k'] == clusters:
            count_correct += 1

            #alignment with the chosen gap
            results = main_algorithm(df_encoded, final_k_results['gap'], T, s,
                                     0)

            #convert similarity matrix into distance matrix
            results['score'] = convert_to_distance_matrix(results['score'])

            #hierarchical clustering
            Z = hierarchical_clustering(results['score'], method, gap)

            #compute clustering indices between partition_generated and partition_found
            c_assignments_found = cut_tree(Z, final_k_results['k'])
            partition_found = cluster_indices(c_assignments_found,
                                              df_encoded.index.tolist())
            computed_indexes = cluster_external_index(partition_generated,
                                                      partition_found)
            final_statistics['Rand'].append(computed_indexes[0])
            final_statistics['Adjusted Rand'].append(computed_indexes[1])
            final_statistics['Fowlkes and Mallows'].append(computed_indexes[2])
            final_statistics['Jaccard'].append(computed_indexes[3])
            final_statistics['Adjusted Wallace'].append(computed_indexes[4])

    if count_correct > 1:

        final_avgs_statistics[n_sequences]['Rand'] = mean(
            final_statistics['Rand'])
        final_avgs_statistics[n_sequences]['Adjusted Rand'] = mean(
            final_statistics['Adjusted Rand'])
        final_avgs_statistics[n_sequences]['Fowlkes and Mallows'] = mean(
            final_statistics['Fowlkes and Mallows'])
        final_avgs_statistics[n_sequences]['Jaccard'] = mean(
            final_statistics['Jaccard'])
        final_avgs_statistics[n_sequences]['Adjusted Wallace'] = mean(
            final_statistics['Adjusted Wallace'])
Example #2
def cluster_validation(M, method, k, partition_found, df_encoded, results, gap, Tp):
    
    #write the cluster stability analysis to a PDF page
    pp = PdfPages('cluster_stability_analysis.pdf')
    
    #dictionary to store all computed indexes for each cluster
    #(the loop variable is named c so it does not shadow the parameter k)
    dicio_cluster_validation = {c: {} for c in range(1, k + 1)}
    for c in range(1, k + 1):
        dicio_cluster_validation[c]['jaccard'] = []
        dicio_cluster_validation[c]['dice'] = []
        dicio_cluster_validation[c]['asymmetric'] = []
        

    #assess cluster stability for the chosen number of clusters k
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3/4)*len(df_encoded)), replace = False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx,'id_patient'],2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap,columns = ['patient1','patient2'])
        #keep only the scores for the sampled pairs, to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1','patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'],method)
        
        c_assignments_bootstrap = cut_tree(Z_bootstrap,k)
        partition_bootstrap = cluster_indices(c_assignments_bootstrap,idx)
        
        for k_i in range(1, k + 1):
            aux_jaccard = []
            aux_dice = []
            aux_asymmetric = []
            #compare cluster k_i of the full-data partition with every bootstrap cluster
            #(j instead of i, so the bootstrap loop variable is not shadowed)
            for j in range(1, k + 1):
                aux = cluster_validation_indexes(partition_found[k_i-1], partition_bootstrap[j-1])
                aux_jaccard.append(aux[0])
                aux_dice.append(aux[2])
                aux_asymmetric.append(aux[1])
            
            dicio_cluster_validation[k_i]['jaccard'].append(max(aux_jaccard))
            dicio_cluster_validation[k_i]['dice'].append(max(aux_dice))
            dicio_cluster_validation[k_i]['asymmetric'].append(max(aux_asymmetric))
            
    #summarize the stability indexes (median, mean and standard deviation) for each cluster
    jaccard_cluster_median = []
    dice_median = []
    asymmetric_median = []
    jaccard_cluster_avg = []
    dice_avg = []
    asymmetric_avg = []
    jaccard_cluster_std = []
    dice_std = []
    asymmetric_std = []
    table = []
    
    for c in range(1, k + 1):
        jaccard_cluster_median.append(round(median(dicio_cluster_validation[c]['jaccard']),3))
        dice_median.append(round(median(dicio_cluster_validation[c]['dice']),3))
        asymmetric_median.append(round(median(dicio_cluster_validation[c]['asymmetric']),3))
        jaccard_cluster_avg.append(round(mean(dicio_cluster_validation[c]['jaccard']),3))
        dice_avg.append(round(mean(dicio_cluster_validation[c]['dice']),3))
        asymmetric_avg.append(round(mean(dicio_cluster_validation[c]['asymmetric']),3))
        jaccard_cluster_std.append(round(stdev(dicio_cluster_validation[c]['jaccard']),3))
        dice_std.append(round(stdev(dicio_cluster_validation[c]['dice']),3))
        asymmetric_std.append(round(stdev(dicio_cluster_validation[c]['asymmetric']),3))

        table.append([str(c) + ' (' + str(len(partition_found[c-1])) + ')',
                      jaccard_cluster_median[c-1], dice_median[c-1], asymmetric_median[c-1],
                      jaccard_cluster_avg[c-1], dice_avg[c-1], asymmetric_avg[c-1],
                      jaccard_cluster_std[c-1], dice_std[c-1], asymmetric_std[c-1]])
  
    headers = ['Cluster Number', 'J_median','D_median','A_median','J_avg','D_avg','A_avg','J_std','D_std','A_std']
    #print(tabulate(table,headers))
    
    fig = plt.figure(figsize=(10,4))
    ax = plt.gca()
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.axis('tight')
    ax.axis('off')
    plt.title('Cluster stability analysis \n gap: %.2f, Tp: %.2f, %s link' %(gap,Tp,method))
    the_table = plt.table(cellText=table, colLabels=headers, loc='center',cellLoc='center')
    the_table.set_fontsize(8)
    the_table.scale(1.1, 1.1)
    pp.savefig(fig)
    pp.close()
    
Example #3
#for each bootstrap sample
for i in range(M):
    # sampling rows of the original data
    idx = np.random.choice(len(df_ranks),
                           int((3 / 4) * len(df_ranks)),
                           replace=False)
    # Hierarchical Clustering of the bootstrap sample
    Z_bootstrap = linkage(df_ranks.loc[idx, :], 'ward')

    #for each number of clusters k=2,...,9
    for k in range(2, 10):
        c_assignments_original = cut_tree(Z, k)
        c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
        #list of clusters for the clustering result with the original data
        partition_original = cluster_indices(c_assignments_original,
                                             df.index.tolist())
        #list of clusters for the clustering result with the bootstrap sample
        partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)

        #compute 4 different cluster external indexes between the partitions
        computed_indexes = cluster_external_index(partition_original,
                                                  partition_bootstrap)
        dicio_statistics[k]['rand'].append(computed_indexes[0])
        dicio_statistics[k]['adjusted'].append(computed_indexes[1])
        dicio_statistics[k]['FM'].append(computed_indexes[2])
        dicio_statistics[k]['jaccard'].append(computed_indexes[3])

#obtain the average cluster external indexes for each number of clusters and show the results in a table
rand_avg = []
adjusted_avg = []
FM_avg = []
Example #4
def validation(M, df_encoded, results, Z, method, max_K):
    ##############################################################################
    # HOW MANY CLUSTERS?
    ###############################################################################
    # bootstrap-style resampling: 75% of the rows are drawn without replacement (subsampling)

    #dictionary to store all computed indexes for each number of clusters K=2,...,max_K-1
    dicio_statistics = {k: {} for k in range(2, max_K)}
    for k in range(2, max_K):
        dicio_statistics[k]['rand'] = []
        dicio_statistics[k]['adjusted'] = []
        dicio_statistics[k]['FM'] = []
        dicio_statistics[k]['jaccard'] = []
        dicio_statistics[k]['adjusted_wallace'] = []

    #for each bootstrap sample
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded),
                               int((3 / 4) * len(df_encoded)),
                               replace=False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(
            itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap,
                                              columns=['patient1', 'patient2'])
        #extract the scores regarding the previous sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results,
                                     patient_comb_bootstrap,
                                     how='inner',
                                     on=['patient1', 'patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)

        #for each number of clusters k=2,...,max_K-1
        for k in range(2, max_K):
            c_assignments_original = cut_tree(Z, k)
            c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
            #list of clusters for the clustering result with the original data
            partition_original = cluster_indices(c_assignments_original,
                                                 df_encoded.index.tolist())
            #list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)

            #compute five cluster external indexes between the partitions
            computed_indexes = cluster_external_index(partition_original,
                                                      partition_bootstrap)
            #print(computed_indexes)
            dicio_statistics[k]['rand'].append(computed_indexes[0])
            dicio_statistics[k]['adjusted'].append(computed_indexes[1])
            dicio_statistics[k]['FM'].append(computed_indexes[2])
            dicio_statistics[k]['jaccard'].append(computed_indexes[3])
            dicio_statistics[k]['adjusted_wallace'].append(computed_indexes[4])

    ###########################################################################
    #  DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k at which the largest (weighted)
    # number of clustering indices attain their maximum average value.
    # That k should also have a low standard deviation - ideally the minimum
    # over all k's, or at least low compared to the other k's.
    ###########################################################################

    #dataframe that stores the clustering indices averages for each k
    df_avgs = pd.DataFrame(index=range(2, max_K),
                           columns=[
                               'k', 'Rand', 'Adjusted Rand',
                               'Fowlkes and Mallows', 'Jaccard',
                               'Adjusted Wallace', 'k_score_avg'
                           ],
                           dtype='float')
    #dataframe that stores the clustering indices standard deviations for each k
    df_stds = pd.DataFrame(index=range(2, max_K),
                           columns=[
                               'k', 'Rand', 'Adjusted Rand',
                               'Fowlkes and Mallows', 'Jaccard',
                               'Adjusted Wallace'
                           ],
                           dtype='float')

    #computing the means and standard deviations
    # (single-step .loc indexing, so the assignments are not lost on a copy)
    for k in range(2, max_K):
        df_avgs.loc[k, 'k'] = k
        df_avgs.loc[k, 'Rand'] = mean(dicio_statistics[k]['rand'])
        df_avgs.loc[k, 'Adjusted Rand'] = mean(dicio_statistics[k]['adjusted'])
        df_avgs.loc[k, 'Fowlkes and Mallows'] = mean(dicio_statistics[k]['FM'])
        df_avgs.loc[k, 'Jaccard'] = mean(dicio_statistics[k]['jaccard'])
        df_avgs.loc[k, 'Adjusted Wallace'] = mean(
            dicio_statistics[k]['adjusted_wallace'])
        df_avgs.loc[k, 'k_score_avg'] = 0

        df_stds.loc[k, 'k'] = k
        df_stds.loc[k, 'Rand'] = stdev(dicio_statistics[k]['rand'])
        df_stds.loc[k, 'Adjusted Rand'] = stdev(dicio_statistics[k]['adjusted'])
        df_stds.loc[k, 'Fowlkes and Mallows'] = stdev(dicio_statistics[k]['FM'])
        df_stds.loc[k, 'Jaccard'] = stdev(dicio_statistics[k]['jaccard'])
        df_stds.loc[k, 'Adjusted Wallace'] = stdev(
            dicio_statistics[k]['adjusted_wallace'])
        #df_stds.loc[k]['k_score_std'] = 0
        #df_stds.loc[k]['k_score_std_2'] = 0

    #weights given to each clustering index; the Rand index is left out
    #because it is considered less informative than the adjusted indices
    weights = {
        'Adjusted Rand': 1 / 4,
        'Fowlkes and Mallows': 1 / 4,
        'Jaccard': 1 / 4,
        'Adjusted Wallace': 1 / 4
    }
    #find the maximum of each clustering index and the k at which it occurs;
    #each k's score is the sum of the weights of the indices whose maximum falls at that k
    for column in df_avgs.drop(columns=['k', 'Rand', 'k_score_avg']).columns:
        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max, 'k_score_avg'] += weights[column]

    #final number of clusters chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()

    #same approach followed as for df_avgs
    #    for column in df_stds.drop(columns = ['k','k_score_std','k_score_std_2']).columns:
    #        idx_min = df_stds[column].idxmin()
    #        idx_min_2 = df_stds[column].nsmallest(2).idxmax()
    #        df_stds.loc[idx_min]['k_score_std'] = df_stds.loc[idx_min]['k_score_std'] + weights[column]
    #        df_stds.loc[idx_min_2]['k_score_std_2'] = df_stds.loc[idx_min_2]['k_score_std_2'] + weights[column]
    #
    #At least 3 clustering indices (except Rand) have to agree on same minimum
    # std for the chosen k above
    #    if(df_stds.loc[final_k_avg]['k_score_std']>=3*(2/9)):
    #        final_k= final_k_avg
    #    elif(df_stds.loc[final_k_avg]['k_score_std_2']>=3*(2/9)):
    #        final_k = final_k_avg
    #    else:
    #        #the final k changes to the second best score with the avgs
    #        final_k_avg_2 = df_avgs['k_score_avg'].nlargest(2).idxmin()
    #        if(df_stds.loc[final_k_avg_2]['k_score_std']>=3*(2/9)):
    #            final_k = final_k_avg_2
    #        else:
    #            final_k = final_k_avg
    #
    #table_avgs= tabulate(df_avgs, headers='keys', tablefmt='psql', showindex=False)
    #print(table_avgs)
    #display(HTML(table_avgs))
    #table_stds= tabulate(df_stds, headers='keys', tablefmt='psql', showindex=False)
    #print(table_stds)

    #bar chart of standard deviation
    # Create a figure instance
    #plt.figure(2)
    #df_stds.drop(columns = 'k').plot.bar()
    #plt.show()

    #print('NUMBER OF CLUSTERS:',final_k)
    return [df_avgs, df_stds, final_k]
Example #5
def validation(M, df_encoded, results, Z, method, min_K, max_K, automatic, pp, gap, Tp):
    ##############################################################################
    # HOW MANY CLUSTERS?
    ###############################################################################
    # bootstrap-style resampling: 75% of the rows are drawn without replacement (subsampling)

    #dictionary to store all computed indexes for each number of clusters K=min_K,...,max_K-1
    dicio_statistics = {k:{} for k in range(min_K,max_K)}
    for k in range(min_K,max_K):
        dicio_statistics[k]['rand'] = []
        dicio_statistics[k]['adjusted'] = []
        dicio_statistics[k]['FM'] = []
        dicio_statistics[k]['jaccard'] = []
        dicio_statistics[k]['adjusted_wallace'] = []


    #for each bootstrap sample
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3/4)*len(df_encoded)), replace = False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx,'id_patient'],2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap,columns = ['patient1','patient2'])
        #extract the scores regarding the previous sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1','patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'],method)

        #for each number of clusters k=min_K,...,max_K-1
        for k in range(min_K,max_K):
            c_assignments_original = cut_tree(Z,k)
            c_assignments_bootstrap = cut_tree(Z_bootstrap,k)
            #list of clusters for the clustering result with the original data
            partition_original = cluster_indices(c_assignments_original,df_encoded.index.tolist())
            #list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap,idx)

            #compute five cluster external indexes between the partitions
            computed_indexes = cluster_external_index(partition_original,partition_bootstrap)
            #print(computed_indexes)
            dicio_statistics[k]['rand'].append(computed_indexes[0])
            dicio_statistics[k]['adjusted'].append(computed_indexes[1])
            dicio_statistics[k]['FM'].append(computed_indexes[2])
            dicio_statistics[k]['jaccard'].append(computed_indexes[3])
            dicio_statistics[k]['adjusted_wallace'].append(computed_indexes[4])


    ###########################################################################
    #  DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k at which the largest (weighted)
    # number of clustering indices attain their maximum average value.
    # That k should also have a low standard deviation - ideally the minimum
    # over all k's, or at least low compared to the other k's.
    ###########################################################################

    #dataframe that stores the clustering indices averages for each k
    df_avgs = pd.DataFrame(index = range(min_K,max_K),columns = ['k','Rand','Adjusted Rand','Fowlkes and Mallows','Jaccard','Adjusted Wallace','k_score_avg'], dtype='float')
    #dataframe that stores the clustering indices standard deviations for each k
    df_stds = pd.DataFrame(index = range(min_K,max_K),columns = ['k','Rand','Adjusted Rand','Fowlkes and Mallows','Jaccard','Adjusted Wallace'],dtype = 'float')

    #computing the means and standard deviations
    # (single-step .loc indexing, so the assignments are not lost on a copy)
    for k in range(min_K, max_K):
        df_avgs.loc[k, 'k'] = k
        df_avgs.loc[k, 'Rand'] = mean(dicio_statistics[k]['rand'])
        df_avgs.loc[k, 'Adjusted Rand'] = mean(dicio_statistics[k]['adjusted'])
        df_avgs.loc[k, 'Fowlkes and Mallows'] = mean(dicio_statistics[k]['FM'])
        df_avgs.loc[k, 'Jaccard'] = mean(dicio_statistics[k]['jaccard'])
        df_avgs.loc[k, 'Adjusted Wallace'] = mean(dicio_statistics[k]['adjusted_wallace'])
        df_avgs.loc[k, 'k_score_avg'] = 0

        df_stds.loc[k, 'k'] = k
        df_stds.loc[k, 'Rand'] = stdev(dicio_statistics[k]['rand'])
        df_stds.loc[k, 'Adjusted Rand'] = stdev(dicio_statistics[k]['adjusted'])
        df_stds.loc[k, 'Fowlkes and Mallows'] = stdev(dicio_statistics[k]['FM'])
        df_stds.loc[k, 'Jaccard'] = stdev(dicio_statistics[k]['jaccard'])
        df_stds.loc[k, 'Adjusted Wallace'] = stdev(dicio_statistics[k]['adjusted_wallace'])
        #df_stds.loc[k]['k_score_std'] = 0
        #df_stds.loc[k]['k_score_std_2'] = 0

    #weights given to each clustering index; the Rand index is left out
    #because it is considered less informative than the adjusted indices
    weights = {'Adjusted Rand': 1/4, 'Fowlkes and Mallows': 1/4,
               'Jaccard': 1/4, 'Adjusted Wallace': 1/4}
    #find the maximum of each clustering index and the k at which it occurs;
    #each k's score is the sum of the weights of the indices whose maximum falls at that k
    columns = df_avgs.columns
    analyzed_columns = columns[2:-1]
    for column in analyzed_columns:
        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max, 'k_score_avg'] += weights[column]

    #final number of clusters chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()


    if automatic in (0, 1):

        fig = plt.figure(figsize=(10,5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        colLabels=df_avgs.loc[:, df_avgs.columns != 'k_score_avg'].columns
        cell_text = []
        for row in range(len(df_avgs)):
            cell_text.append(df_avgs.iloc[row,0:-1].round(decimals=3))
        plt.title('Average values of five clustering indices \n gap: %.2f, Tp: %.2f, %s link' %(gap,Tp,method))
        plt.table(cellText=cell_text, colLabels=colLabels, loc='center',cellLoc='center',fontsize=20)
        pp.savefig(fig)
        
    
        #bar chart of standard deviation - standard deviation of all measures
        # Create a figure instance
    #    plt.figure(2)
    #    df_stds.loc[:,df_stds.columns != 'k'].plot.bar(figsize=(15,8))
    #    plt.title('Standard deviation of five measures versus number of clusters',fontsize=25)
    #    plt.xlabel('Number of clusters',labelpad=20,fontsize=20)    
    #    plt.ylabel('Standard deviation',labelpad=10,fontsize=20)    
    #    plt.xticks(size = 20)
    #    plt.yticks(size = 20)
    #    plt.show()
        
        
        fig1 = plt.figure(3)
        df_stds.loc[:,'Adjusted Rand'].plot.bar(figsize=(15,8),color='forestgreen')
        plt.title('Standard deviation of Adjusted Rand versus number of clusters \n gap: %.2f, Tp: %.2f, %s link' %(gap,Tp,method),fontsize=25)
        plt.xlabel('Number of clusters',labelpad=20,fontsize=15)    
        plt.ylabel('Standard deviation',labelpad=10,fontsize=15)    
        plt.xticks(size = 20)
        plt.yticks(size = 20)
        #plt.show()
    
        pp.savefig(fig1)


    return [df_avgs,df_stds,final_k]
Example #6
def cluster_validation(M, method, k, partition_found, df_encoded, results):
    #dictionary to store all computed indexes for each cluster
    #(the loop variable is named c so it does not shadow the parameter k)
    dicio_cluster_validation = {c: {} for c in range(1, k + 1)}
    for c in range(1, k + 1):
        dicio_cluster_validation[c]['jaccard'] = []
        dicio_cluster_validation[c]['dice'] = []
        dicio_cluster_validation[c]['asymmetric'] = []

    #assess cluster stability for the chosen number of clusters k
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded),
                               int((3 / 4) * len(df_encoded)),
                               replace=False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(
            itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap,
                                              columns=['patient1', 'patient2'])
        #keep only the scores for the sampled pairs, to be used in hierarchical clustering
        results_bootstrap = pd.merge(results,
                                     patient_comb_bootstrap,
                                     how='inner',
                                     on=['patient1', 'patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)

        c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
        partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)

        for k_i in range(1, k + 1):
            aux_jaccard = []
            aux_dice = []
            aux_asymmetric = []
            #compare cluster k_i of the full-data partition with every bootstrap cluster
            #(j instead of i, so the bootstrap loop variable is not shadowed)
            for j in range(1, k + 1):
                aux = cluster_validation_indexes(partition_found[k_i - 1],
                                                 partition_bootstrap[j - 1])
                aux_jaccard.append(aux[0])
                aux_dice.append(aux[2])
                aux_asymmetric.append(aux[1])

            dicio_cluster_validation[k_i]['jaccard'].append(max(aux_jaccard))
            dicio_cluster_validation[k_i]['dice'].append(max(aux_dice))
            dicio_cluster_validation[k_i]['asymmetric'].append(
                max(aux_asymmetric))

    #summarize the stability indexes (median, mean and standard deviation) for each cluster
    jaccard_cluster_median = []
    dice_median = []
    asymmetric_median = []
    jaccard_cluster_avg = []
    dice_avg = []
    asymmetric_avg = []
    jaccard_cluster_std = []
    dice_std = []
    asymmetric_std = []
    table = []
    cluster_sizes = []

    for c in range(1, k + 1):
        jaccard_cluster_median.append(
            round(median(dicio_cluster_validation[c]['jaccard']), 3))
        dice_median.append(
            round(median(dicio_cluster_validation[c]['dice']), 3))
        asymmetric_median.append(
            round(median(dicio_cluster_validation[c]['asymmetric']), 3))
        jaccard_cluster_avg.append(
            round(mean(dicio_cluster_validation[c]['jaccard']), 3))
        dice_avg.append(round(mean(dicio_cluster_validation[c]['dice']), 3))
        asymmetric_avg.append(
            round(mean(dicio_cluster_validation[c]['asymmetric']), 3))
        jaccard_cluster_std.append(
            round(stdev(dicio_cluster_validation[c]['jaccard']), 3))
        dice_std.append(round(stdev(dicio_cluster_validation[c]['dice']), 3))
        asymmetric_std.append(
            round(stdev(dicio_cluster_validation[c]['asymmetric']), 3))

        cluster_sizes.append(len(partition_found[c - 1]))

        table.append([
            str(c) + ' (' + str(len(partition_found[c - 1])) + ')',
            jaccard_cluster_median[c - 1], dice_median[c - 1],
            asymmetric_median[c - 1], jaccard_cluster_avg[c - 1],
            dice_avg[c - 1], asymmetric_avg[c - 1], jaccard_cluster_std[c - 1],
            dice_std[c - 1], asymmetric_std[c - 1]
        ])

    headers = [
        'Cluster Number', 'J_median', 'D_median', 'A_median', 'J_avg', 'D_avg',
        'A_avg', 'J_std', 'D_std', 'A_std'
    ]
    print(tabulate(table, headers))

    cluster_stability = [
        jaccard_cluster_median, dice_median, asymmetric_median,
        jaccard_cluster_avg, dice_avg, asymmetric_avg, jaccard_cluster_std,
        dice_std, asymmetric_std, cluster_sizes
    ]

    return cluster_stability
Example #7
def validation(M, df_encoded, results, Z, method, min_K, max_K, automatic=None, pp=None, gap=None, Tp=None):
    ##############################################################################
    # HOW MANY CLUSTERS?
    ###############################################################################
    # bootstrap-style resampling: 75% of the rows are drawn without replacement (subsampling)

    #dictionary to store all computed indexes for each number of clusters K=min_K,...,max_K-1
    trees = {}
    dicio_statistics = {k:{} for k in range(min_K,max_K)}

    for k in range(min_K,max_K):
        for index in indexes:
            dicio_statistics[k][index] = []

        c_assignments_original = cut_tree(Z, k)
        # list of clusters for the clustering result with the original data
        partition_original = cluster_indices(c_assignments_original, df_encoded.index.tolist())
        trees[k] = partition_original


    #for each bootstrap sample
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3/4)*len(df_encoded)), replace = False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx,'id_patient'],2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap,columns = ['patient1','patient2'])
        #extract the scores regarding the previous sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1','patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'],method)

        #for each number of clusters k=min_K,...,max_K-1
        for k, partition in trees.items():

            c_assignments_bootstrap = cut_tree(Z_bootstrap,k)
            #list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap,idx)
            #compute the external indexes between the two partitions
            #computed_indexes = cluster_external_index(partition,partition_bootstrap)
            computed_indexes = clustereval.calculate_external(partition, partition_bootstrap)



            #print(computed_indexes)
            for pos, index in enumerate(external_indexes):
                dicio_statistics[k][index].append(computed_indexes[pos])

    for k, partition in trees.items():
        calc_idx = clustereval.calculate_internal(results[['patient1', 'patient2', 'score']], partition, k, trees[max_K - 1])
        for index in internal_indexes:
            dicio_statistics[k][index].append(calc_idx[index])
    ###########################################################################
    #  DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k at which the largest (weighted)
    # number of clustering indices attain their best average value.
    # That k should also have a low standard deviation - ideally the minimum
    # over all k's, or at least low compared to the other k's.
    ###########################################################################

    #dataframe that stores the clustering indices averages for each k
    col = indexes.copy()
    col.extend(['k', 'k_score_avg'])
    df_avgs = pd.DataFrame(index = range(min_K,max_K),columns = col, dtype='float')
    #dataframe that stores the clustering indices standard deviations for each k
    df_stds = pd.DataFrame(index = range(min_K,max_K),columns = col, dtype = 'float')

    #computing the means and standard deviations
    # (single-step .loc indexing, so the assignments are not lost on a copy)
    for k in range(min_K, max_K):
        df_avgs.loc[k, 'k'] = k
        df_stds.loc[k, 'k'] = k
        for index in indexes:
            if index not in internal_indexes:
                df_avgs.loc[k, index] = mean(dicio_statistics[k][index])
                df_stds.loc[k, index] = stdev(dicio_statistics[k][index])
            else:
                #internal indexes are computed once on the full data, not per bootstrap sample
                df_avgs.loc[k, index] = dicio_statistics[k][index][0]
                df_stds.loc[k, index] = dicio_statistics[k][index][0]

        df_avgs.loc[k, 'k_score_avg'] = 0
        df_stds.loc[k, 'k_score_std'] = 0

        #df_stds.loc[k]['k_score_std_2'] = 0

    #uniform weights over all clustering indexes
    weights = {index: 1/len(indexes) for index in indexes}
    #find the best value of each clustering index and the k at which it occurs;
    #each k's score is the sum of the weights of the indexes that peak at that k
    columns = df_avgs.columns
    analyzed_columns = columns[2:-3]
    for column in analyzed_columns:

        if column in min_indexes:
            #indexes for which smaller values are better
            idx_min = df_avgs[column].idxmin()
            df_avgs.loc[idx_min, 'k_score_avg'] += weights[column]
            continue

        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max, 'k_score_avg'] += weights[column]

    #idx_min_s_dbw = df_avgs['s_dbw'].idxmin()
    #idx_min_cvnn = df_avgs['cvnn'].idxmin()
    #df_avgs.loc[idx_min_s_dbw]['k_score_avg'] = df_avgs.loc[idx_min_s_dbw]['k_score_avg'] + weights['s_dbw']
    #df_avgs.loc[idx_min_cvnn]['k_score_avg'] = df_avgs.loc[idx_min_cvnn]['k_score_avg'] + weights['cvnn']

    #final number of clusters chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()


    if automatic in (0, 1):

        fig1 = plt.figure(figsize=(10,5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        #colLabels=df_avgs.loc[:, df_avgs.columns != 'k_score_avg'].columns
        colLabels1 = external_indexes.copy()
        colLabels1.append('k')
        cell_text1 = []
        for row in range(len(df_avgs)):
            cell_text1.append(df_avgs.iloc[row,list(range(len(external_indexes))) + [-2]].round(decimals=3))
        plt.title('Average values of eleven external indices \n gap: %.2f, Tp: %.2f, %s link' %(gap,Tp,method))
        the_table = plt.table(cellText=cell_text1, colLabels=colLabels1, loc='center',cellLoc='center')
        #the_table.auto_set_font_size(False)
        #the_table.set_fontsize(4)
        fig1.text(0.1, 0.01, "R = Rand, AR = Adjusted Rand, FM = Fowlkes and Mallows, J = Jaccard, AW = Adjusted Wallace, "
                      "VD = Van Dongen, H = Huberts, H' = Huberts Normalized, F = F-Measure, "
                      "VI = Variation of information, MS = Minkowski", fontsize=5)
        pp.savefig(fig1)



        fig2 = plt.figure(3, figsize=(10, 5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        # colLabels=df_avgs.loc[:, df_avgs.columns != 'k_score_avg'].columns
        colLabels2 = internal_indexes.copy()
        colLabels2.append('k')
        cell_text2 = []
        for row in range(len(df_avgs)):
            cell_text2.append(df_avgs.iloc[row, list(range(len(external_indexes), len(indexes))) + [-2]].round(decimals=3))
        plt.title('Average values of six internal indices \n gap: %.2f, Tp: %.2f, %s link' % (gap, Tp, method))
        plt.table(cellText=cell_text2, colLabels=colLabels2, loc='center', cellLoc='center', fontsize=20)
        pp.savefig(fig2)


        #bar chart of standard deviation - standard deviation of all measures
        # Create a figure instance
    #    plt.figure(2)
    #    df_stds.loc[:,df_stds.columns != 'k'].plot.bar(figsize=(15,8))
    #    plt.title('Standard deviation of five measures versus number of clusters',fontsize=25)
    #    plt.xlabel('Number of clusters',labelpad=20,fontsize=20)
    #    plt.ylabel('Standard deviation',labelpad=10,fontsize=20)
    #    plt.xticks(size = 20)
    #    plt.yticks(size = 20)
    #    plt.show()


        fig3 = plt.figure(4)
        df_stds.loc[:,'AR'].plot.bar(figsize=(15,8),color='forestgreen')
        plt.title('Standard deviation of Adjusted Rand versus number of clusters \n gap: %.2f, Tp: %.2f, %s link' %(gap,Tp,method),fontsize=25)
        plt.xlabel('Number of clusters',labelpad=20,fontsize=15)
        plt.ylabel('Standard deviation',labelpad=10,fontsize=15)
        plt.xticks(size = 20)
        plt.yticks(size = 20)
        #plt.show()

        pp.savefig(fig3)


    return [df_avgs,df_stds,final_k]