# NOTE: import block reconstructed from the names used below (the excerpt does not show the original imports)
import itertools
from collections import defaultdict
from statistics import mean, median, stdev

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from scipy.cluster.hierarchy import linkage, cut_tree
from tabulate import tabulate
import clustereval  # local/third-party module providing calculate_external / calculate_internal


if final_k_results['k'] == clusters:
    count_correct = count_correct + 1
#alignment with the chosen gap
results = main_algorithm(df_encoded, final_k_results['gap'], T, s, 0)
#convert similarity matrix into distance matrix
results['score'] = convert_to_distance_matrix(results['score'])
#hierarchical clustering
Z = hierarchical_clustering(results['score'], method, gap)
#compute clustering indices between partition_generated and partition_found
c_assignments_found = cut_tree(Z, final_k_results['k'])
partition_found = cluster_indices(c_assignments_found, df_encoded.index.tolist())
computed_indexes = cluster_external_index(partition_generated, partition_found)
final_statistics['Rand'].append(computed_indexes[0])
final_statistics['Adjusted Rand'].append(computed_indexes[1])
final_statistics['Fowlkes and Mallows'].append(computed_indexes[2])
final_statistics['Jaccard'].append(computed_indexes[3])
final_statistics['Adjusted Wallace'].append(computed_indexes[4])

if count_correct > 1:
    final_avgs_statistics[n_sequences]['Rand'] = mean(final_statistics['Rand'])
    final_avgs_statistics[n_sequences]['Adjusted Rand'] = mean(final_statistics['Adjusted Rand'])
    final_avgs_statistics[n_sequences]['Fowlkes and Mallows'] = mean(final_statistics['Fowlkes and Mallows'])
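
# Hedged sketch: cluster_indices() is called throughout this module but is not defined in the excerpt.
# Based on how it is used (flat cluster labels from cut_tree plus the original row identifiers),
# it presumably groups the identifiers by cluster label and returns one list of members per cluster.
def cluster_indices(c_assignments, row_ids):
    """Group row_ids by the flat cluster labels returned by scipy's cut_tree."""
    labels = np.asarray(c_assignments).ravel()
    ids = np.asarray(row_ids)
    return [ids[labels == label].tolist() for label in np.unique(labels)]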
def cluster_validation(M, method, k, partition_found, df_encoded, results, gap, Tp):
    #write cluster stability analysis on a pdf page
    pp = PdfPages('cluster_stability_analysis.pdf')
    #dictionary to store all computed indexes for each cluster
    dicio_cluster_validation = {k: {} for k in range(1, k + 1)}
    for k in range(1, k + 1):
        dicio_cluster_validation[k]['jaccard'] = []
        dicio_cluster_validation[k]['dice'] = []
        dicio_cluster_validation[k]['asymmetric'] = []
    #assess cluster stability for K=k that was the number of clusters chosen
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3 / 4) * len(df_encoded)), replace=False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap, columns=['patient1', 'patient2'])
        #extract the scores regarding the previous sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1', 'patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)
        c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
        partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)
        for k_i in range(1, k + 1):
            aux_jaccard = []
            aux_dice = []
            aux_asymmetric = []
            for i in range(1, k + 1):
                aux = cluster_validation_indexes(partition_found[k_i - 1], partition_bootstrap[i - 1])
                aux_jaccard.append(aux[0])
                aux_dice.append(aux[2])
                aux_asymmetric.append(aux[1])
            dicio_cluster_validation[k_i]['jaccard'].append(max(aux_jaccard))
            dicio_cluster_validation[k_i]['dice'].append(max(aux_dice))
            dicio_cluster_validation[k_i]['asymmetric'].append(max(aux_asymmetric))
    #obtain the average cluster external indexes for each number of clusters
    jaccard_cluster_median = []
    dice_median = []
    asymmetric_median = []
    jaccard_cluster_avg = []
    dice_avg = []
    asymmetric_avg = []
    jaccard_cluster_std = []
    dice_std = []
    asymmetric_std = []
    table = []
    for k in range(1, k + 1):
        jaccard_cluster_median.append(round(median(dicio_cluster_validation[k]['jaccard']), 3))
        dice_median.append(round(median(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_median.append(round(median(dicio_cluster_validation[k]['asymmetric']), 3))
        jaccard_cluster_avg.append(round(mean(dicio_cluster_validation[k]['jaccard']), 3))
        dice_avg.append(round(mean(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_avg.append(round(mean(dicio_cluster_validation[k]['asymmetric']), 3))
        jaccard_cluster_std.append(round(stdev(dicio_cluster_validation[k]['jaccard']), 3))
        dice_std.append(round(stdev(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_std.append(round(stdev(dicio_cluster_validation[k]['asymmetric']), 3))
        table.append([str(k) + ' (' + str(len(partition_found[k - 1])) + ')',
                      jaccard_cluster_median[k - 1], dice_median[k - 1], asymmetric_median[k - 1],
                      jaccard_cluster_avg[k - 1], dice_avg[k - 1], asymmetric_avg[k - 1],
                      jaccard_cluster_std[k - 1], dice_std[k - 1], asymmetric_std[k - 1]])
    headers = ['Cluster Number', 'J_median', 'D_median', 'A_median',
               'J_avg', 'D_avg', 'A_avg', 'J_std', 'D_std', 'A_std']
    #print(tabulate(table, headers))
    fig = plt.figure(figsize=(10, 4))
    ax = plt.gca()
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.axis('tight')
    ax.axis('off')
    plt.title('Cluster stability analysis \n gap: %.2f, Tp: %.2f, %s link' % (gap, Tp, method))
    the_table = plt.table(cellText=table, colLabels=headers, loc='center', cellLoc='center')
    the_table.set_fontsize(8)
    the_table.scale(1.1, 1.1)
    pp.savefig(fig)
    pp.close()
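
# Hedged sketch: cluster_validation_indexes() is not defined in this excerpt. From the way its
# return value is unpacked above (aux[0] -> Jaccard, aux[1] -> asymmetric, aux[2] -> Dice),
# it presumably compares two clusters as sets of member ids; this is an illustration only.
def cluster_validation_indexes(cluster_a, cluster_b):
    a, b = set(cluster_a), set(cluster_b)
    intersection = len(a & b)
    union = len(a | b)
    jaccard = intersection / union if union else 0.0
    asymmetric = intersection / len(a) if a else 0.0          # fraction of cluster_a recovered
    dice = 2 * intersection / (len(a) + len(b)) if (a or b) else 0.0
    return [jaccard, asymmetric, dice]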
#for each bootstrap sample
for i in range(M):
    # sampling rows of the original data
    idx = np.random.choice(len(df_ranks), int((3 / 4) * len(df_ranks)), replace=False)
    # Hierarchical Clustering of the bootstrap sample
    Z_bootstrap = linkage(df_ranks.loc[idx, :], 'ward')
    #for each number of clusters k=2,...,9
    for k in range(2, 10):
        c_assignments_original = cut_tree(Z, k)
        c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
        #list of clusters for the clustering result with the original data
        partition_original = cluster_indices(c_assignments_original, df.index.tolist())
        #list of clusters for the clustering result with the bootstrap sample
        partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)
        #compute 4 different cluster external indexes between the partitions
        computed_indexes = cluster_external_index(partition_original, partition_bootstrap)
        dicio_statistics[k]['rand'].append(computed_indexes[0])
        dicio_statistics[k]['adjusted'].append(computed_indexes[1])
        dicio_statistics[k]['FM'].append(computed_indexes[2])
        dicio_statistics[k]['jaccard'].append(computed_indexes[3])

#obtain the average cluster external indexes for each number of clusters and show the results in a table
rand_avg = []
adjusted_avg = []
FM_avg = []
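
# Hedged sketch: the excerpt breaks off just as the per-k averages are being collected. Mirroring the
# fuller validation() routines below, the summary step presumably continues along these lines;
# jaccard_avg and the tabulate call are assumptions, not the author's original code.
jaccard_avg = []
table = []
for k in range(2, 10):
    rand_avg.append(round(mean(dicio_statistics[k]['rand']), 3))
    adjusted_avg.append(round(mean(dicio_statistics[k]['adjusted']), 3))
    FM_avg.append(round(mean(dicio_statistics[k]['FM']), 3))
    jaccard_avg.append(round(mean(dicio_statistics[k]['jaccard']), 3))
    table.append([k, rand_avg[-1], adjusted_avg[-1], FM_avg[-1], jaccard_avg[-1]])
print(tabulate(table, headers=['k', 'Rand', 'Adjusted Rand', 'FM', 'Jaccard']))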
def validation(M, df_encoded, results, Z, method, max_K):
    ##############################################################################
    # HOW MANY CLUSTERS?
    ##############################################################################
    # bootstrap method - sampling without replacement
    #dictionary to store all computed indexes for each number of clusters K=2,...,max_K
    dicio_statistics = {k: {} for k in range(2, max_K)}
    for k in range(2, max_K):
        dicio_statistics[k]['rand'] = []
        dicio_statistics[k]['adjusted'] = []
        dicio_statistics[k]['FM'] = []
        dicio_statistics[k]['jaccard'] = []
        dicio_statistics[k]['adjusted_wallace'] = []

    #for each bootstrap sample
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3 / 4) * len(df_encoded)), replace=False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap, columns=['patient1', 'patient2'])
        #extract the scores regarding the previous sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1', 'patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)
        #for each number of clusters k=2,...,max_K
        for k in range(2, max_K):
            c_assignments_original = cut_tree(Z, k)
            c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
            #list of clusters for the clustering result with the original data
            partition_original = cluster_indices(c_assignments_original, df_encoded.index.tolist())
            #list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)
            #compute 5 different cluster external indexes between the partitions
            computed_indexes = cluster_external_index(partition_original, partition_bootstrap)
            #print(computed_indexes)
            dicio_statistics[k]['rand'].append(computed_indexes[0])
            dicio_statistics[k]['adjusted'].append(computed_indexes[1])
            dicio_statistics[k]['FM'].append(computed_indexes[2])
            dicio_statistics[k]['jaccard'].append(computed_indexes[3])
            dicio_statistics[k]['adjusted_wallace'].append(computed_indexes[4])

    ###########################################################################
    # DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k at which most clustering indices reach
    # their maximum average value.
    # The chosen k also needs a low standard deviation - ideally the minimum among
    # all k's, or at least a value that is still low compared to the others.
    ###########################################################################
    #dataframe that stores the clustering indices averages for each k
    df_avgs = pd.DataFrame(index=range(2, max_K),
                           columns=['k', 'Rand', 'Adjusted Rand', 'Fowlkes and Mallows',
                                    'Jaccard', 'Adjusted Wallace', 'k_score_avg'],
                           dtype='float')
    #dataframe that stores the clustering indices standard deviations for each k
    df_stds = pd.DataFrame(index=range(2, max_K),
                           columns=['k', 'Rand', 'Adjusted Rand', 'Fowlkes and Mallows',
                                    'Jaccard', 'Adjusted Wallace'],
                           dtype='float')
    #computing the means and standard deviations
    for k in range(2, max_K):
        df_avgs.loc[k, 'k'] = k
        df_avgs.loc[k, 'Rand'] = mean(dicio_statistics[k]['rand'])
        df_avgs.loc[k, 'Adjusted Rand'] = mean(dicio_statistics[k]['adjusted'])
        df_avgs.loc[k, 'Fowlkes and Mallows'] = mean(dicio_statistics[k]['FM'])
        df_avgs.loc[k, 'Jaccard'] = mean(dicio_statistics[k]['jaccard'])
        df_avgs.loc[k, 'Adjusted Wallace'] = mean(dicio_statistics[k]['adjusted_wallace'])
        df_avgs.loc[k, 'k_score_avg'] = 0
        df_stds.loc[k, 'k'] = k
        df_stds.loc[k, 'Rand'] = stdev(dicio_statistics[k]['rand'])
        df_stds.loc[k, 'Adjusted Rand'] = stdev(dicio_statistics[k]['adjusted'])
        df_stds.loc[k, 'Fowlkes and Mallows'] = stdev(dicio_statistics[k]['FM'])
        df_stds.loc[k, 'Jaccard'] = stdev(dicio_statistics[k]['jaccard'])
        df_stds.loc[k, 'Adjusted Wallace'] = stdev(dicio_statistics[k]['adjusted_wallace'])
        #df_stds.loc[k, 'k_score_std'] = 0
        #df_stds.loc[k, 'k_score_std_2'] = 0

    #weights given to each clustering index; the Rand index is not weighted as much as the other indices
    weights = {'Adjusted Rand': 1 / 4,
               'Fowlkes and Mallows': 1 / 4,
               'Jaccard': 1 / 4,
               'Adjusted Wallace': 1 / 4}
    #find the maximum value of each clustering index and the k at which it occurs;
    #the score of each k is the sum of the weights of the indices that reach their maximum at that k
    for column in df_avgs.drop(columns=['k', 'Rand', 'k_score_avg']).columns:
        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max, 'k_score_avg'] = df_avgs.loc[idx_max, 'k_score_avg'] + weights[column]
    #final number of clusters chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()

    #same approach followed as for df_avgs
    # for column in df_stds.drop(columns=['k', 'k_score_std', 'k_score_std_2']).columns:
    #     idx_min = df_stds[column].idxmin()
    #     idx_min_2 = df_stds[column].nsmallest(2).idxmax()
    #     df_stds.loc[idx_min, 'k_score_std'] = df_stds.loc[idx_min, 'k_score_std'] + weights[column]
    #     df_stds.loc[idx_min_2, 'k_score_std_2'] = df_stds.loc[idx_min_2, 'k_score_std_2'] + weights[column]
    #
    # #At least 3 clustering indices (except Rand) have to agree on the same minimum
    # #std for the chosen k above
    # if df_stds.loc[final_k_avg]['k_score_std'] >= 3 * (2 / 9):
    #     final_k = final_k_avg
    # elif df_stds.loc[final_k_avg]['k_score_std_2'] >= 3 * (2 / 9):
    #     final_k = final_k_avg
    # else:
    #     #the final k changes to the second best score with the avgs
    #     final_k_avg_2 = df_avgs['k_score_avg'].nlargest(2).idxmin()
    #     if df_stds.loc[final_k_avg_2]['k_score_std'] >= 3 * (2 / 9):
    #         final_k = final_k_avg_2
    #     else:
    #         final_k = final_k_avg

    #table_avgs = tabulate(df_avgs, headers='keys', tablefmt='psql', showindex=False)
    #print(table_avgs)
    #display(HTML(table_avgs))
    #table_stds = tabulate(df_stds, headers='keys', tablefmt='psql', showindex=False)
    #print(table_stds)

    #bar chart of standard deviation
    # Create a figure instance
    #plt.figure(2)
    #df_stds.drop(columns='k').plot.bar()
    #plt.show()
    #print('NUMBER OF CLUSTERS:', final_k)
    return [df_avgs, df_stds, final_k]
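
# Hedged sketch: cluster_external_index() is called by validation() above but is not included in this
# excerpt. Judging by how its return value is indexed ([0] Rand, [1] Adjusted Rand, [2] Fowlkes-Mallows,
# [3] Jaccard, [4] Adjusted Wallace), a pair-counting implementation along these lines would fit;
# this is an illustration under those assumptions, not the author's original code.
from math import sqrt

def cluster_external_index(partition_u, partition_v):
    """Pair-counting external indices between two partitions given as lists of clusters."""
    label_u, label_v = {}, {}
    for c, cluster in enumerate(partition_u):
        for item in cluster:
            label_u[item] = c
    for c, cluster in enumerate(partition_v):
        for item in cluster:
            label_v[item] = c
    # only items present in both partitions can be compared (relevant for bootstrap samples)
    items = sorted(set(label_u) & set(label_v))
    a = b = c2 = d = 0
    for x, y in itertools.combinations(items, 2):
        same_u = label_u[x] == label_u[y]
        same_v = label_v[x] == label_v[y]
        if same_u and same_v:
            a += 1
        elif same_u:
            b += 1
        elif same_v:
            c2 += 1
        else:
            d += 1
    n_pairs = a + b + c2 + d
    if n_pairs == 0:
        return [0.0, 0.0, 0.0, 0.0, 0.0]
    rand = (a + d) / n_pairs
    expected_a = (a + b) * (a + c2) / n_pairs
    max_a = ((a + b) + (a + c2)) / 2
    adjusted_rand = (a - expected_a) / (max_a - expected_a) if max_a != expected_a else 0.0
    fm = a / sqrt((a + b) * (a + c2)) if (a + b) and (a + c2) else 0.0
    jaccard = a / (a + b + c2) if (a + b + c2) else 0.0
    wallace = a / (a + b) if (a + b) else 0.0            # W(U -> V)
    expected_wallace = (a + c2) / n_pairs                # expected Wallace under independence
    adjusted_wallace = ((wallace - expected_wallace) / (1 - expected_wallace)
                        if expected_wallace != 1 else 0.0)
    return [rand, adjusted_rand, fm, jaccard, adjusted_wallace]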
def validation(M, df_encoded, results, Z, method, min_K, max_K, automatic, pp, gap, Tp):
    ##############################################################################
    # HOW MANY CLUSTERS?
    ##############################################################################
    # bootstrap method - sampling without replacement
    #dictionary to store all computed indexes for each number of clusters K=min_K,...,max_K
    dicio_statistics = {k: {} for k in range(min_K, max_K)}
    for k in range(min_K, max_K):
        dicio_statistics[k]['rand'] = []
        dicio_statistics[k]['adjusted'] = []
        dicio_statistics[k]['FM'] = []
        dicio_statistics[k]['jaccard'] = []
        dicio_statistics[k]['adjusted_wallace'] = []

    #for each bootstrap sample
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3 / 4) * len(df_encoded)), replace=False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap, columns=['patient1', 'patient2'])
        #extract the scores regarding the previous sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1', 'patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)
        #for each number of clusters k=min_K,...,max_K
        for k in range(min_K, max_K):
            c_assignments_original = cut_tree(Z, k)
            c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
            #list of clusters for the clustering result with the original data
            partition_original = cluster_indices(c_assignments_original, df_encoded.index.tolist())
            #list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)
            #compute 5 different cluster external indexes between the partitions
            computed_indexes = cluster_external_index(partition_original, partition_bootstrap)
            #print(computed_indexes)
            dicio_statistics[k]['rand'].append(computed_indexes[0])
            dicio_statistics[k]['adjusted'].append(computed_indexes[1])
            dicio_statistics[k]['FM'].append(computed_indexes[2])
            dicio_statistics[k]['jaccard'].append(computed_indexes[3])
            dicio_statistics[k]['adjusted_wallace'].append(computed_indexes[4])

    ###########################################################################
    # DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k at which most clustering indices reach
    # their maximum average value.
    # The chosen k also needs a low standard deviation - ideally the minimum among
    # all k's, or at least a value that is still low compared to the others.
    ###########################################################################
    #dataframe that stores the clustering indices averages for each k
    df_avgs = pd.DataFrame(index=range(min_K, max_K),
                           columns=['k', 'Rand', 'Adjusted Rand', 'Fowlkes and Mallows',
                                    'Jaccard', 'Adjusted Wallace', 'k_score_avg'],
                           dtype='float')
    #dataframe that stores the clustering indices standard deviations for each k
    df_stds = pd.DataFrame(index=range(min_K, max_K),
                           columns=['k', 'Rand', 'Adjusted Rand', 'Fowlkes and Mallows',
                                    'Jaccard', 'Adjusted Wallace'],
                           dtype='float')
    #computing the means and standard deviations
    for k in range(min_K, max_K):
        df_avgs.loc[k, 'k'] = k
        df_avgs.loc[k, 'Rand'] = mean(dicio_statistics[k]['rand'])
        df_avgs.loc[k, 'Adjusted Rand'] = mean(dicio_statistics[k]['adjusted'])
        df_avgs.loc[k, 'Fowlkes and Mallows'] = mean(dicio_statistics[k]['FM'])
        df_avgs.loc[k, 'Jaccard'] = mean(dicio_statistics[k]['jaccard'])
        df_avgs.loc[k, 'Adjusted Wallace'] = mean(dicio_statistics[k]['adjusted_wallace'])
        df_avgs.loc[k, 'k_score_avg'] = 0
        df_stds.loc[k, 'k'] = k
        df_stds.loc[k, 'Rand'] = stdev(dicio_statistics[k]['rand'])
        df_stds.loc[k, 'Adjusted Rand'] = stdev(dicio_statistics[k]['adjusted'])
        df_stds.loc[k, 'Fowlkes and Mallows'] = stdev(dicio_statistics[k]['FM'])
        df_stds.loc[k, 'Jaccard'] = stdev(dicio_statistics[k]['jaccard'])
        df_stds.loc[k, 'Adjusted Wallace'] = stdev(dicio_statistics[k]['adjusted_wallace'])
        #df_stds.loc[k, 'k_score_std'] = 0
        #df_stds.loc[k, 'k_score_std_2'] = 0

    #weights given to each clustering index; the Rand index is not weighted as much as the other indices
    weights = {'Adjusted Rand': 1 / 4,
               'Fowlkes and Mallows': 1 / 4,
               'Jaccard': 1 / 4,
               'Adjusted Wallace': 1 / 4}
    #find the maximum value of each clustering index and the k at which it occurs;
    #the score of each k is the sum of the weights of the indices that reach their maximum at that k
    columns = df_avgs.columns
    analyzed_columns = columns[2:-1]
    for column in analyzed_columns:
        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max, 'k_score_avg'] = df_avgs.loc[idx_max, 'k_score_avg'] + weights[column]
    #final number of clusters chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()

    if automatic == 0 or automatic == 1:
        fig = plt.figure(figsize=(10, 5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        colLabels = df_avgs.loc[:, df_avgs.columns != 'k_score_avg'].columns
        cell_text = []
        for row in range(len(df_avgs)):
            cell_text.append(df_avgs.iloc[row, 0:-1].round(decimals=3))
        plt.title('Average values of five clustering indices \n gap: %.2f, Tp: %.2f, %s link' % (gap, Tp, method))
        plt.table(cellText=cell_text, colLabels=colLabels, loc='center', cellLoc='center', fontsize=20)
        pp.savefig(fig)

        #bar chart of standard deviation - standard deviation of all measures
        # Create a figure instance
        # plt.figure(2)
        # df_stds.loc[:, df_stds.columns != 'k'].plot.bar(figsize=(15, 8))
        # plt.title('Standard deviation of five measures versus number of clusters', fontsize=25)
        # plt.xlabel('Number of clusters', labelpad=20, fontsize=20)
        # plt.ylabel('Standard deviation', labelpad=10, fontsize=20)
        # plt.xticks(size=20)
        # plt.yticks(size=20)
        # plt.show()

        fig1 = plt.figure(3)
        df_stds.loc[:, 'Adjusted Rand'].plot.bar(figsize=(15, 8), color='forestgreen')
        plt.title('Standard deviation of Adjusted Rand versus number of clusters \n gap: %.2f, Tp: %.2f, %s link'
                  % (gap, Tp, method), fontsize=25)
        plt.xlabel('Number of clusters', labelpad=20, fontsize=15)
        plt.ylabel('Standard deviation', labelpad=10, fontsize=15)
        plt.xticks(size=20)
        plt.yticks(size=20)
        #plt.show()
        pp.savefig(fig1)

    return [df_avgs, df_stds, final_k]
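
# Hedged usage sketch for the validation() variant above. The data names (df_encoded, and a results
# table with patient1/patient2/score columns), the report filename, and the parameter values are
# assumptions for illustration, not taken from the original code.
pp = PdfPages('number_of_clusters_report.pdf')
Z = linkage(results['score'], 'average')
df_avgs, df_stds, final_k = validation(M=100, df_encoded=df_encoded, results=results, Z=Z,
                                       method='average', min_K=2, max_K=10,
                                       automatic=0, pp=pp, gap=0.0, Tp=0.25)
pp.close()
print('Chosen number of clusters:', final_k)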
def cluster_validation(M, method, k, partition_found, df_encoded, results):
    #dictionary to store all computed indexes for each cluster
    dicio_cluster_validation = {k: {} for k in range(1, k + 1)}
    for k in range(1, k + 1):
        dicio_cluster_validation[k]['jaccard'] = []
        dicio_cluster_validation[k]['dice'] = []
        dicio_cluster_validation[k]['asymmetric'] = []
    #assess cluster stability for K=k that was the number of clusters chosen
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3 / 4) * len(df_encoded)), replace=False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap, columns=['patient1', 'patient2'])
        #extract the scores regarding the previous sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1', 'patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)
        c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
        partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)
        for k_i in range(1, k + 1):
            aux_jaccard = []
            aux_dice = []
            aux_asymmetric = []
            for i in range(1, k + 1):
                aux = cluster_validation_indexes(partition_found[k_i - 1], partition_bootstrap[i - 1])
                aux_jaccard.append(aux[0])
                aux_dice.append(aux[2])
                aux_asymmetric.append(aux[1])
            #keep the best match between each original cluster and the bootstrap clusters
            dicio_cluster_validation[k_i]['jaccard'].append(max(aux_jaccard))
            dicio_cluster_validation[k_i]['dice'].append(max(aux_dice))
            dicio_cluster_validation[k_i]['asymmetric'].append(max(aux_asymmetric))
    #obtain the median, average and standard deviation of the stability indexes for each cluster
    jaccard_cluster_median = []
    dice_median = []
    asymmetric_median = []
    jaccard_cluster_avg = []
    dice_avg = []
    asymmetric_avg = []
    jaccard_cluster_std = []
    dice_std = []
    asymmetric_std = []
    table = []
    cluster_sizes = []
    for k in range(1, k + 1):
        jaccard_cluster_median.append(round(median(dicio_cluster_validation[k]['jaccard']), 3))
        dice_median.append(round(median(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_median.append(round(median(dicio_cluster_validation[k]['asymmetric']), 3))
        jaccard_cluster_avg.append(round(mean(dicio_cluster_validation[k]['jaccard']), 3))
        dice_avg.append(round(mean(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_avg.append(round(mean(dicio_cluster_validation[k]['asymmetric']), 3))
        jaccard_cluster_std.append(round(stdev(dicio_cluster_validation[k]['jaccard']), 3))
        dice_std.append(round(stdev(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_std.append(round(stdev(dicio_cluster_validation[k]['asymmetric']), 3))
        cluster_sizes.append(len(partition_found[k - 1]))
        table.append([str(k) + ' (' + str(len(partition_found[k - 1])) + ')',
                      jaccard_cluster_median[k - 1], dice_median[k - 1], asymmetric_median[k - 1],
                      jaccard_cluster_avg[k - 1], dice_avg[k - 1], asymmetric_avg[k - 1],
                      jaccard_cluster_std[k - 1], dice_std[k - 1], asymmetric_std[k - 1]])
    headers = ['Cluster Number', 'J_median', 'D_median', 'A_median',
               'J_avg', 'D_avg', 'A_avg', 'J_std', 'D_std', 'A_std']
    print(tabulate(table, headers))
    cluster_stability = [jaccard_cluster_median, dice_median, asymmetric_median,
                         jaccard_cluster_avg, dice_avg, asymmetric_avg,
                         jaccard_cluster_std, dice_std, asymmetric_std, cluster_sizes]
    return cluster_stability
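
# Hedged usage sketch: assess the stability of the chosen solution with the routine above.
# final_k, Z, df_encoded and results are assumed to come from a previous validation() run;
# the parameter values are illustrative only.
c_assignments = cut_tree(Z, final_k)
partition_found = cluster_indices(c_assignments, df_encoded.index.tolist())
cluster_stability = cluster_validation(M=100, method='average', k=final_k,
                                       partition_found=partition_found,
                                       df_encoded=df_encoded, results=results)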
def validation(M, df_encoded, results, Z, method, min_K, max_K, automatic=None, pp=None, gap=None, Tp=None):
    ##############################################################################
    # HOW MANY CLUSTERS?
    ##############################################################################
    # bootstrap method - sampling without replacement
    #dictionaries to store the original partitions and all computed indexes
    #for each number of clusters K=min_K,...,max_K
    nn_history = defaultdict(dict)
    trees = defaultdict(dict)
    dicio_statistics = {k: {} for k in range(min_K, max_K)}
    for k in range(min_K, max_K):
        for index in indexes:
            dicio_statistics[k][index] = []
        c_assignments_original = cut_tree(Z, k)
        # list of clusters for the clustering result with the original data
        partition_original = cluster_indices(c_assignments_original, df_encoded.index.tolist())
        trees[k] = partition_original

    #for each bootstrap sample
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3 / 4) * len(df_encoded)), replace=False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap, columns=['patient1', 'patient2'])
        #extract the scores regarding the previous sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1', 'patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)
        #for each number of clusters k=min_K,...,max_K
        for k, partition in trees.items():
            c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
            #list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)
            #compute the external cluster indexes between the partitions
            #computed_indexes = cluster_external_index(partition, partition_bootstrap)
            computed_indexes = clustereval.calculate_external(partition, partition_bootstrap)
            #print(computed_indexes)
            for pos, index in enumerate(external_indexes):
                dicio_statistics[k][index].append(computed_indexes[pos])

    #compute the internal indexes on the original partitions
    for k, partition in trees.items():
        calc_idx = clustereval.calculate_internal(results[['patient1', 'patient2', 'score']],
                                                  partition, k, trees[max_K - 1])
        for index in internal_indexes:
            dicio_statistics[k][index].append(calc_idx[index])

    ###########################################################################
    # DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k at which most clustering indices reach
    # their optimum average value.
    # The chosen k also needs a low standard deviation - ideally the minimum among
    # all k's, or at least a value that is still low compared to the others.
    ###########################################################################
    #dataframe that stores the clustering indices averages for each k
    col = indexes.copy()
    col.extend(['k', 'k_score_avg'])
    df_avgs = pd.DataFrame(index=range(min_K, max_K), columns=col, dtype='float')
    #dataframe that stores the clustering indices standard deviations for each k
    df_stds = pd.DataFrame(index=range(min_K, max_K), columns=col, dtype='float')
    #computing the means and standard deviations
    for k in range(min_K, max_K):
        df_avgs.loc[k, 'k'] = k
        df_stds.loc[k, 'k'] = k
        for index in indexes:
            if index not in internal_indexes:
                df_avgs.loc[k, index] = mean(dicio_statistics[k][index])
                df_stds.loc[k, index] = stdev(dicio_statistics[k][index])
            else:
                #internal indexes are computed once on the original partition, not bootstrapped
                df_avgs.loc[k, index] = dicio_statistics[k][index][0]
                df_stds.loc[k, index] = dicio_statistics[k][index][0]
        df_avgs.loc[k, 'k_score_avg'] = 0
        df_stds.loc[k, 'k_score_std'] = 0
        #df_stds.loc[k, 'k_score_std_2'] = 0

    #equal weight given to each clustering index
    weights = {index: 1 / len(indexes) for index in indexes}
    #find the optimum value of each clustering index and the k at which it occurs;
    #the score of each k is the sum of the weights of the indices that reach their optimum at that k
    columns = df_avgs.columns
    analyzed_columns = columns[2:-3]
    for column in analyzed_columns:
        if column in min_indexes:
            idx_min = df_avgs[column].idxmin()
            df_avgs.loc[idx_min, 'k_score_avg'] = df_avgs.loc[idx_min, 'k_score_avg'] + weights[column]
            continue
        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max, 'k_score_avg'] = df_avgs.loc[idx_max, 'k_score_avg'] + weights[column]
    #idx_min_s_dbw = df_avgs['s_dbw'].idxmin()
    #idx_min_cvnn = df_avgs['cvnn'].idxmin()
    #df_avgs.loc[idx_min_s_dbw, 'k_score_avg'] = df_avgs.loc[idx_min_s_dbw, 'k_score_avg'] + weights['s_dbw']
    #df_avgs.loc[idx_min_cvnn, 'k_score_avg'] = df_avgs.loc[idx_min_cvnn, 'k_score_avg'] + weights['cvnn']

    #final number of clusters chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()

    if automatic == 0 or automatic == 1:
        fig1 = plt.figure(figsize=(10, 5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        #colLabels = df_avgs.loc[:, df_avgs.columns != 'k_score_avg'].columns
        colLabels1 = external_indexes.copy()
        colLabels1.append('k')
        cell_text1 = []
        for row in range(len(df_avgs)):
            cell_text1.append(df_avgs.iloc[row, list(range(len(external_indexes))) + [-2]].round(decimals=3))
        plt.title('Average values of eleven external indices \n gap: %.2f, Tp: %.2f, %s link' % (gap, Tp, method))
        the_table = plt.table(cellText=cell_text1, colLabels=colLabels1, loc='center', cellLoc='center')
        #the_table.auto_set_font_size(False)
        #the_table.set_fontsize(4)
        fig1.text(0.1, 0.01,
                  "R = Rand, AR = Adjusted Rand, FM = Fowlkes and Mallows, J = Jaccard, AW = Adjusted Wallace, "
                  "VD = Van Dongen, H = Huberts, H' = Huberts Normalized, F = F-Measure, "
                  "VI = Variation of information, MS = Minkowski", fontsize=5)
        pp.savefig(fig1)

        fig2 = plt.figure(3, figsize=(10, 5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        colLabels2 = internal_indexes.copy()
        colLabels2.append('k')
        cell_text2 = []
        for row in range(len(df_avgs)):
            cell_text2.append(df_avgs.iloc[row, list(range(len(external_indexes), len(indexes))) + [-2]].round(decimals=3))
        plt.title('Average values of six internal indices \n gap: %.2f, Tp: %.2f, %s link' % (gap, Tp, method))
        plt.table(cellText=cell_text2, colLabels=colLabels2, loc='center', cellLoc='center', fontsize=20)
        pp.savefig(fig2)

        #bar chart of standard deviation - standard deviation of all measures
        # Create a figure instance
        # plt.figure(2)
        # df_stds.loc[:, df_stds.columns != 'k'].plot.bar(figsize=(15, 8))
        # plt.title('Standard deviation of five measures versus number of clusters', fontsize=25)
        # plt.xlabel('Number of clusters', labelpad=20, fontsize=20)
        # plt.ylabel('Standard deviation', labelpad=10, fontsize=20)
        # plt.xticks(size=20)
        # plt.yticks(size=20)
        # plt.show()

        fig3 = plt.figure(4)
        df_stds.loc[:, 'AR'].plot.bar(figsize=(15, 8), color='forestgreen')
        plt.title('Standard deviation of Adjusted Rand versus number of clusters \n gap: %.2f, Tp: %.2f, %s link'
                  % (gap, Tp, method), fontsize=25)
        plt.xlabel('Number of clusters', labelpad=20, fontsize=15)
        plt.ylabel('Standard deviation', labelpad=10, fontsize=15)
        plt.xticks(size=20)
        plt.yticks(size=20)
        #plt.show()
        pp.savefig(fig3)

    return [df_avgs, df_stds, final_k]
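
# Hedged sketch: the module-level lists used by the validation() variant above (indexes,
# external_indexes, internal_indexes, min_indexes) are not shown in this excerpt. The external
# abbreviations follow the figure caption in the code; the internal index names are placeholders
# and must match whatever keys clustereval.calculate_internal actually returns.
external_indexes = ['R', 'AR', 'FM', 'J', 'AW', 'VD', 'H', "H'", 'F', 'VI', 'MS']
internal_indexes = ['silhouette', 'calinski_harabasz', 'dunn', 'davies_bouldin', 's_dbw', 'cvnn']  # placeholder names
indexes = external_indexes + internal_indexes
# indices for which a smaller value indicates a better partition (assumption)
min_indexes = ['VD', 'VI', 'MS', 'davies_bouldin', 's_dbw', 'cvnn']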