            #alignment with the chosen gap
            results = main_algorithm(df_encoded, final_k_results['gap'], T, s,
                                     0)

            #convert similarity matrix into distance matrix
            results['score'] = convert_to_distance_matrix(results['score'])

            #hierarchical clustering
            Z = hierarchical_clustering(results['score'], method, gap)

            #compute clustering indices between partition_generated and partition_found
            c_assignments_found = cut_tree(Z, final_k_results['k'])
            partition_found = cluster_indices(c_assignments_found,
                                              df_encoded.index.tolist())
            computed_indexes = cluster_external_index(partition_generated,
                                                      partition_found)
            final_statistics['Rand'].append(computed_indexes[0])
            final_statistics['Adjusted Rand'].append(computed_indexes[1])
            final_statistics['Fowlkes and Mallows'].append(computed_indexes[2])
            final_statistics['Jaccard'].append(computed_indexes[3])
            final_statistics['Adjusted Wallace'].append(computed_indexes[4])

    if count_correct > 1:

        final_avgs_statistics[n_sequences]['Rand'] = mean(
            final_statistics['Rand'])
        final_avgs_statistics[n_sequences]['Adjusted Rand'] = mean(
            final_statistics['Adjusted Rand'])
        final_avgs_statistics[n_sequences]['Fowlkes and Mallows'] = mean(
            final_statistics['Fowlkes and Mallows'])
        final_avgs_statistics[n_sequences]['Jaccard'] = mean(
            final_statistics['Jaccard'])
        final_avgs_statistics[n_sequences]['Adjusted Wallace'] = mean(
            final_statistics['Adjusted Wallace'])
Example #2
def validation(M, df_encoded, results, Z, method, max_K):
    ##############################################################################
    # HOW MANY CLUSTERS?
    ###############################################################################
    # bootstrap method - sampling without replacement

    #dictionary to store all computed indices for each number of clusters
    #k = 2, ..., max_K-1 (range's upper bound is exclusive)
    dicio_statistics = {k: {} for k in range(2, max_K)}
    for k in range(2, max_K):
        dicio_statistics[k]['rand'] = []
        dicio_statistics[k]['adjusted'] = []
        dicio_statistics[k]['FM'] = []
        dicio_statistics[k]['jaccard'] = []
        dicio_statistics[k]['adjusted_wallace'] = []

    #for each bootstrap sample
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded),
                               int((3 / 4) * len(df_encoded)),
                               replace=False)
        idx = np.sort(idx)
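        #note: idx holds positional row numbers, so the .loc lookup below
        #assumes df_encoded keeps its default RangeIndex (labels 0..n-1)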
        #get all pairwise combinations of the sampled patients
        patient_comb_bootstrap = list(
            itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap,
                                              columns=['patient1', 'patient2'])
        #extract the scores for the previously sampled pairs to be used in hierarchical clustering
        results_bootstrap = pd.merge(results,
                                     patient_comb_bootstrap,
                                     how='inner',
                                     on=['patient1', 'patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)
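        #(scipy's linkage interprets a 1-D input as a condensed distance
        #matrix, so the merged 'score' column is taken to be pairwise
        #distances)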

        #for each number of clusters k = 2, ..., max_K-1
        for k in range(2, max_K):
            c_assignments_original = cut_tree(Z, k)
            c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
            #list of clusters for the clustering result with the original data
            partition_original = cluster_indices(c_assignments_original,
                                                 df_encoded.index.tolist())
            #list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)

            #compute five cluster external indices between the partitions
            computed_indexes = cluster_external_index(partition_original,
                                                      partition_bootstrap)
            #print(computed_indexes)
            dicio_statistics[k]['rand'].append(computed_indexes[0])
            dicio_statistics[k]['adjusted'].append(computed_indexes[1])
            dicio_statistics[k]['FM'].append(computed_indexes[2])
            dicio_statistics[k]['jaccard'].append(computed_indexes[3])
            dicio_statistics[k]['adjusted_wallace'].append(computed_indexes[4])

    ###########################################################################
    #  DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k that attains the maximum average
    # value for the largest number of clustering indices.
    # That k should also have a low standard deviation - ideally the minimum
    # over all k's, or at least a value that stays low compared to the others.
    ###########################################################################

    #dataframe that stores the clustering indices averages for each k
    df_avgs = pd.DataFrame(index=range(2, max_K),
                           columns=[
                               'k', 'Rand', 'Adjusted Rand',
                               'Fowlkes and Mallows', 'Jaccard',
                               'Adjusted Wallace', 'k_score_avg'
                           ],
                           dtype='float')
    #dataframe that stores the standard deviations of the five indices for each k
    df_stds = pd.DataFrame(index=range(2, max_K),
                           columns=[
                               'k', 'Rand', 'Adjusted Rand',
                               'Fowlkes and Mallows', 'Jaccard',
                               'Adjusted Wallace'
                           ],
                           dtype='float')

    #computing the means and standard deviations
    for k in range(2, max_K):
        #single .loc[row, col] indexing ensures the assignment modifies the
        #dataframe itself rather than a temporary copy
        df_avgs.loc[k, 'k'] = k
        df_avgs.loc[k, 'Rand'] = mean(dicio_statistics[k]['rand'])
        df_avgs.loc[k, 'Adjusted Rand'] = mean(dicio_statistics[k]['adjusted'])
        df_avgs.loc[k, 'Fowlkes and Mallows'] = mean(dicio_statistics[k]['FM'])
        df_avgs.loc[k, 'Jaccard'] = mean(dicio_statistics[k]['jaccard'])
        df_avgs.loc[k, 'Adjusted Wallace'] = mean(
            dicio_statistics[k]['adjusted_wallace'])
        df_avgs.loc[k, 'k_score_avg'] = 0

        df_stds.loc[k, 'k'] = k
        df_stds.loc[k, 'Rand'] = stdev(dicio_statistics[k]['rand'])
        df_stds.loc[k, 'Adjusted Rand'] = stdev(
            dicio_statistics[k]['adjusted'])
        df_stds.loc[k, 'Fowlkes and Mallows'] = stdev(
            dicio_statistics[k]['FM'])
        df_stds.loc[k, 'Jaccard'] = stdev(dicio_statistics[k]['jaccard'])
        df_stds.loc[k, 'Adjusted Wallace'] = stdev(
            dicio_statistics[k]['adjusted_wallace'])
        #df_stds.loc[k]['k_score_std'] = 0
        #df_stds.loc[k]['k_score_std_2'] = 0

    #weights given to each clustering index; the Rand index is left out
    #because it is not valued as much as the other indices
    weights = {
        'Adjusted Rand': 1 / 4,
        'Fowlkes and Mallows': 1 / 4,
        'Jaccard': 1 / 4,
        'Adjusted Wallace': 1 / 4
    }
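    #the four weights sum to 1, so a k that maximizes every index scores 1.0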
    #find the maximum of each clustering index and the k at which it occurs;
    #each k's score is the sum of the weights of the indices that peak at it
    for column in df_avgs.drop(columns=['k', 'Rand', 'k_score_avg']).columns:
        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max, 'k_score_avg'] += weights[column]
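    #e.g. if k=3 attains the maxima of Adjusted Rand and Jaccard while k=4
    #attains those of Fowlkes and Mallows and Adjusted Wallace, both k's
    #score 0.5 and idxmax below breaks the tie in favour of the smaller k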

    #final number of clusters chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()

    #same approach followed as for df_avgs
    #    for column in df_stds.drop(columns = ['k','k_score_std','k_score_std_2']).columns:
    #        idx_min = df_stds[column].idxmin()
    #        idx_min_2 = df_stds[column].nsmallest(2).idxmax()
    #        df_stds.loc[idx_min]['k_score_std'] = df_stds.loc[idx_min]['k_score_std'] + weights[column]
    #        df_stds.loc[idx_min_2]['k_score_std_2'] = df_stds.loc[idx_min_2]['k_score_std_2'] + weights[column]
    #
    #At least 3 clustering indices (except Rand) have to agree on same minimum
    # std for the chosen k above
    #    if(df_stds.loc[final_k_avg]['k_score_std']>=3*(2/9)):
    #        final_k= final_k_avg
    #    elif(df_stds.loc[final_k_avg]['k_score_std_2']>=3*(2/9)):
    #        final_k = final_k_avg
    #    else:
    #        #the final k changes to the second best score with the avgs
    #        final_k_avg_2 = df_avgs['k_score_avg'].nlargest(2).idxmin()
    #        if(df_stds.loc[final_k_avg_2]['k_score_std']>=3*(2/9)):
    #            final_k = final_k_avg_2
    #        else:
    #            final_k = final_k_avg
    #
    #table_avgs= tabulate(df_avgs, headers='keys', tablefmt='psql', showindex=False)
    #print(table_avgs)
    #display(HTML(table_avgs))
    #table_stds= tabulate(df_stds, headers='keys', tablefmt='psql', showindex=False)
    #print(table_stds)

    #bar chart of standard deviation
    # Create a figure instance
    #plt.figure(2)
    #df_stds.drop(columns = 'k').plot.bar()
    #plt.show()

    #print('NUMBER OF CLUSTERS:',final_k)
    return [df_avgs, df_stds, final_k]
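
The helper cluster_indices is not defined on this page. A minimal sketch of what these examples appear to expect - grouping the flat labels returned by cut_tree into one list of row indices per cluster - could look like the following; the exact signature and return shape are assumptions:

import numpy as np

def cluster_indices(c_assignments, index_list):
    #hypothetical sketch: group the entries of index_list by the cluster
    #label that cut_tree assigned to the row at the same position
    labels = np.asarray(c_assignments).ravel()
    index_arr = np.asarray(index_list)
    return [index_arr[labels == c].tolist() for c in np.unique(labels)]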
Example #3
    # sampling rows of the original data
    idx = np.random.choice(len(df_ranks), int((3 / 4) * len(df_ranks)),
                           replace=False)
    # Hierarchical Clustering of the bootstrap sample
    Z_bootstrap = linkage(df_ranks.loc[idx, :], 'ward')

    #for each number of clusters k=2,...,9
    for k in range(2, 10):
        c_assignments_original = cut_tree(Z, k)
        c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
        #list of clusters for the clustering result with the original data
        partition_original = cluster_indices(c_assignments_original,
                                             df.index.tolist())
        #list of clusters for the clustering result with the bootstrap sample
        partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)

        #compute four cluster external indices between the partitions
        computed_indexes = cluster_external_index(partition_original,
                                                  partition_bootstrap)
        dicio_statistics[k]['rand'].append(computed_indexes[0])
        dicio_statistics[k]['adjusted'].append(computed_indexes[1])
        dicio_statistics[k]['FM'].append(computed_indexes[2])
        dicio_statistics[k]['jaccard'].append(computed_indexes[3])

#obtain the average cluster external indexes for each number of clusters and show the results in a table
rand_avg = []
adjusted_avg = []
FM_avg = []
jaccard_avg = []
table = []
#obtain the standard deviation of adjusted rand index for each number of clusters
adjusted_std = []

for k in range(2, 10):
    rand_avg.append(mean(dicio_statistics[k]['rand']))
    adjusted_avg.append(mean(dicio_statistics[k]['adjusted']))
    FM_avg.append(mean(dicio_statistics[k]['FM']))
    jaccard_avg.append(mean(dicio_statistics[k]['jaccard']))
    adjusted_std.append(stdev(dicio_statistics[k]['adjusted']))
    table.append([k, rand_avg[-1], adjusted_avg[-1], FM_avg[-1],
                  jaccard_avg[-1], adjusted_std[-1]])
Example #4
def validation(M,df_encoded,results,Z,method,min_K,max_K,automatic,pp,gap,Tp):
    ##############################################################################
    # HOW MANY CLUSTERS?
    ###############################################################################
    # bootstrap method - sampling without replacement

    #dictionary to store all computed indices for each number of clusters
    #k = min_K, ..., max_K-1 (range's upper bound is exclusive)
    dicio_statistics = {k:{} for k in range(min_K,max_K)}
    for k in range(min_K,max_K):
        dicio_statistics[k]['rand'] = []
        dicio_statistics[k]['adjusted'] = []
        dicio_statistics[k]['FM'] = []
        dicio_statistics[k]['jaccard'] = []
        dicio_statistics[k]['adjusted_wallace'] = []


    #for each bootstrap sample
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3/4)*len(df_encoded)), replace = False)
        idx = np.sort(idx)
        #get all pairwise combinations of the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx,'id_patient'],2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap,columns = ['patient1','patient2'])
        #extract the scores for the previously sampled pairs to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1','patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'],method)

        #for each number of clusters k = min_K, ..., max_K-1
        for k in range(min_K,max_K):
            c_assignments_original = cut_tree(Z,k)
            c_assignments_bootstrap = cut_tree(Z_bootstrap,k)
            #list of clusters for the clustering result with the original data
            partition_original = cluster_indices(c_assignments_original,df_encoded.index.tolist())
            #list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap,idx)

            #compute five cluster external indices between the partitions
            computed_indexes = cluster_external_index(partition_original,partition_bootstrap)
            #print(computed_indexes)
            dicio_statistics[k]['rand'].append(computed_indexes[0])
            dicio_statistics[k]['adjusted'].append(computed_indexes[1])
            dicio_statistics[k]['FM'].append(computed_indexes[2])
            dicio_statistics[k]['jaccard'].append(computed_indexes[3])
            dicio_statistics[k]['adjusted_wallace'].append(computed_indexes[4])


    ###########################################################################
    #  DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k that attains the maximum average
    # value for the largest number of clustering indices.
    # That k should also have a low standard deviation - ideally the minimum
    # over all k's, or at least a value that stays low compared to the others.
    ###########################################################################

    #dataframe that stores the clustering indices averages for each k
    df_avgs = pd.DataFrame(index = range(min_K,max_K),columns = ['k','Rand','Adjusted Rand','Fowlkes and Mallows','Jaccard','Adjusted Wallace','k_score_avg'], dtype='float')
    #dataframe that stores the standard deviations of the five indices for each k
    df_stds = pd.DataFrame(index = range(min_K,max_K),columns = ['k','Rand','Adjusted Rand','Fowlkes and Mallows','Jaccard','Adjusted Wallace'],dtype = 'float')

    #computing the means and standard deviations
    for k in range(min_K,max_K):
        df_avgs.loc[k,'k'] = k
        df_avgs.loc[k,'Rand'] = mean(dicio_statistics[k]['rand'])
        df_avgs.loc[k,'Adjusted Rand'] = mean(dicio_statistics[k]['adjusted'])
        df_avgs.loc[k,'Fowlkes and Mallows'] = mean(dicio_statistics[k]['FM'])
        df_avgs.loc[k,'Jaccard'] = mean(dicio_statistics[k]['jaccard'])
        df_avgs.loc[k,'Adjusted Wallace'] = mean(dicio_statistics[k]['adjusted_wallace'])
        df_avgs.loc[k,'k_score_avg'] = 0

        df_stds.loc[k,'k'] = k
        df_stds.loc[k,'Rand'] = stdev(dicio_statistics[k]['rand'])
        df_stds.loc[k,'Adjusted Rand'] = stdev(dicio_statistics[k]['adjusted'])
        df_stds.loc[k,'Fowlkes and Mallows'] = stdev(dicio_statistics[k]['FM'])
        df_stds.loc[k,'Jaccard'] = stdev(dicio_statistics[k]['jaccard'])
        df_stds.loc[k,'Adjusted Wallace'] = stdev(dicio_statistics[k]['adjusted_wallace'])
        #df_stds.loc[k]['k_score_std'] = 0
        #df_stds.loc[k]['k_score_std_2'] = 0

    #weights given to each clustering index; the Rand index is left out
    #because it is not valued as much as the other indices
    weights = {'Adjusted Rand': 1/4, 'Fowlkes and Mallows': 1/4,
                   'Jaccard':1/4, 'Adjusted Wallace':1/4}
    #find the maximum of each clustering index and the k at which it occurs;
    #each k's score is the sum of the weights of the indices that peak at it
    columns = df_avgs.columns
    analyzed_columns = columns[2:-1]
    for column in analyzed_columns:
        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max,'k_score_avg'] += weights[column]

    #final number of clusters chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()


    if automatic == 0 or automatic == 1:

        fig = plt.figure(figsize=(10,5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        colLabels=df_avgs.loc[:, df_avgs.columns != 'k_score_avg'].columns
        cell_text = []
        for row in range(len(df_avgs)):
            cell_text.append(df_avgs.iloc[row,0:-1].round(decimals=3))
        plt.title('Average values of five clustering indices \n gap: %.2f, Tp: %.2f, %s link' %(gap,Tp,method))
        plt.table(cellText=cell_text, colLabels=colLabels, loc='center',cellLoc='center',fontsize=20)
        pp.savefig(fig)
        
    
        #bar chart of standard deviation - standard deviation of all measures
        # Create a figure instance
    #    plt.figure(2)
    #    df_stds.loc[:,df_stds.columns != 'k'].plot.bar(figsize=(15,8))
    #    plt.title('Standard deviation of five measures versus number of clusters',fontsize=25)
    #    plt.xlabel('Number of clusters',labelpad=20,fontsize=20)    
    #    plt.ylabel('Standard deviation',labelpad=10,fontsize=20)    
    #    plt.xticks(size = 20)
    #    plt.yticks(size = 20)
    #    plt.show()
        
        
        fig1 = plt.figure(3)
        df_stds.loc[:,'Adjusted Rand'].plot.bar(figsize=(15,8),color='forestgreen')
        plt.title('Standard deviation of Adjusted Rand versus number of clusters \n gap: %.2f, Tp: %.2f, %s link' %(gap,Tp,method),fontsize=25)
        plt.xlabel('Number of clusters',labelpad=20,fontsize=15)    
        plt.ylabel('Standard deviation',labelpad=10,fontsize=15)    
        plt.xticks(size = 20)
        plt.yticks(size = 20)
        #plt.show()
    
        pp.savefig(fig1)


    return [df_avgs,df_stds,final_k]
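
A minimal call sketch for this variant, assuming results holds the pairwise scores in 'patient1'/'patient2'/'score' columns and pp is a matplotlib PdfPages object that collects the summary table and the bar chart; the method name, M, the K range and the gap/Tp values below are illustrative, not taken from the source:

from matplotlib.backends.backend_pdf import PdfPages
from scipy.cluster.hierarchy import linkage

#linkage of the full data, built the same way as inside validation
Z = linkage(results['score'], 'average')

with PdfPages('validation_report.pdf') as pp:
    df_avgs, df_stds, final_k = validation(M=100, df_encoded=df_encoded,
                                           results=results, Z=Z,
                                           method='average', min_K=2,
                                           max_K=10, automatic=1, pp=pp,
                                           gap=0.5, Tp=0.75)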