def chooseInitialCentroids(data_set, CatAtr, NumAtr, clust_numb):
    """Pick ``clust_numb`` rows of ``data_set`` to act as initial cluster centres.

    A full pairwise distance matrix over the rows is built first. When both
    attribute kinds are present the distance between two rows is
    ``(Ar/A) * ds.Eucdist(numeric parts) + (Ac/A) * ds.hamdist(categorical
    parts)``, where ``Ar``/``Ac`` are the numbers of numeric/categorical
    attributes and ``A = Ar + Ac``. When only one kind is present, only the
    matching distance is used (unweighted).

    Then, ``clust_numb`` times:

    1. the remaining row with the largest summed distance to all remaining
       rows is recorded as a centre;
    2. the ``U // clust_numb`` remaining rows closest to that centre are
       dropped from further consideration (this normally includes the centre
       itself, being the closest row to itself).

    Args:
        data_set: DataFrame-like object supporting ``data_set[list_of_columns]``
            and ``.shape``.
        CatAtr: list of categorical column names.
        NumAtr: list of numeric column names.
        clust_numb: number of centres to select.

    Returns:
        list[int]: 0-based row positions of the selected centres.

    Raises:
        ZeroDivisionError: if ``clust_numb`` is 0.
        ValueError: if there are no rows to choose from.
    """
    # ``.values`` instead of ``DataFrame.as_matrix()``: the latter was
    # deprecated and then removed from pandas, ``.values`` works everywhere.
    subsetCat = data_set[CatAtr].values
    subsetNum = data_set[NumAtr].values
    U = data_set.shape[0]
    Ar = len(NumAtr)
    Ac = len(CatAtr)
    A = Ar + Ac

    # Select the row-to-row distance once instead of repeating the loop nest
    # for every attribute mix.
    if Ar and Ac:
        def pair_distance(i, j):
            return ((Ar / A) * ds.Eucdist(subsetNum[i], subsetNum[j])
                    + (Ac / A) * ds.hamdist(subsetCat[i], subsetCat[j]))
    elif not Ar:
        def pair_distance(i, j):
            return ds.hamdist(subsetCat[i], subsetCat[j])
    else:
        def pair_distance(i, j):
            return ds.Eucdist(subsetNum[i], subsetNum[j])

    D = np.zeros(shape=(U, U))
    for i in range(U):
        for j in range(U):
            D[i][j] = pair_distance(i, j)

    centers = []
    # remaining[k] is the original row position of row/column k of the
    # (shrinking) matrix D.
    remaining = np.arange(U)
    step = U // clust_numb
    for _ in range(clust_numb):
        # Row that is, in total, farthest from everything still available.
        farthest = int(np.argmax(D.sum(axis=0)))
        centers.append(int(remaining[farthest]))

        # The ``step`` rows nearest to the new centre. A stable sort breaks
        # ties by lowest position, i.e. the same rows that repeatedly
        # removing the first minimum would pick.
        nearest = np.argsort(D[farthest, :], kind="stable")[:step]
        D = np.delete(np.delete(D, nearest, axis=0), nearest, axis=1)
        remaining = np.delete(remaining, nearest)
    return centers
def dunnIndex(data_set, CatAtr, NumAtr, clusters):
    """Return the Dunn index of a clustering of ``data_set``.

    The value is the smallest distance between two rows that belong to
    different clusters divided by the largest distance between two rows that
    belong to the same cluster.

    Row-to-row distance: when both attribute kinds are present it is
    ``(Ar/A) * ds.Eucdist(numeric parts) + (Ac/A) * ds.hamdist(categorical
    parts)`` with ``Ar``/``Ac`` the numbers of numeric/categorical attributes
    and ``A = Ar + Ac``; with only one kind present the matching distance is
    used unweighted.

    Args:
        data_set: DataFrame-like object supporting ``data_set[list_of_columns]``
            and ``.shape``.
        CatAtr: list of categorical column names.
        NumAtr: list of numeric column names.
        clusters: container indexable by ``0 .. len(clusters) - 1``; each
            entry holds the row positions of one cluster's members.

    Returns:
        The ratio described above (NaN when there are fewer than two
        clusters, because no inter-cluster distance exists).
    """
    # ``.values`` instead of ``DataFrame.as_matrix()``: the latter was
    # deprecated and then removed from pandas, ``.values`` works everywhere.
    subsetCat = data_set[CatAtr].values
    subsetNum = data_set[NumAtr].values
    U = data_set.shape[0]
    Ar = len(NumAtr)
    Ac = len(CatAtr)
    A = Ar + Ac

    # Select the row-to-row distance once instead of repeating the loop nest
    # for every attribute mix.
    if Ar and Ac:
        def pair_distance(i, j):
            return ((Ar / A) * ds.Eucdist(subsetNum[i], subsetNum[j])
                    + (Ac / A) * ds.hamdist(subsetCat[i], subsetCat[j]))
    elif not Ar:
        def pair_distance(i, j):
            return ds.hamdist(subsetCat[i], subsetCat[j])
    else:
        def pair_distance(i, j):
            return ds.Eucdist(subsetNum[i], subsetNum[j])

    D = np.zeros(shape=(U, U))
    for i in range(U):
        for j in range(U):
            D[i][j] = pair_distance(i, j)

    nc = len(clusters)
    intraClust = np.zeros(nc)
    # Only the upper triangle (j > i) is filled; everything else stays NaN so
    # that nanmin ignores it.
    interClust = np.full((nc, nc), np.nan)
    for i in range(nc):
        c1 = clusters[i]
        rows_of_c1 = D[c1, :]
        # Largest distance between two members of cluster i.
        intraClust[i] = np.max(rows_of_c1[:, c1])
        for j in range(i + 1, nc):
            c2 = clusters[j]
            # Smallest distance between a member of i and a member of j.
            interClust[i][j] = np.min(rows_of_c1[:, c2])
    return np.nanmin(interClust) / np.max(intraClust)
def diameter(subsetNum, subsetCat, cluster_indexes, CatAtr, NumAtr, prototype, clust_numb):
    """Return the mean distance between a cluster's members and its prototype.

    The prototype is laid out as ``[categorical part | numeric part]``:
    ``prototype[0:Ac]`` is compared with a member's categorical values via
    ``ds.hamdist`` and ``prototype[Ac:A]`` with its numeric values via
    ``ds.Eucdist`` (``Ac``/``Ar`` = number of categorical/numeric attributes,
    ``A = Ar + Ac``). With both kinds present the two distances are weighted
    by ``Ac/A`` and ``Ar/A``; with only one kind present the matching
    distance is used unweighted.

    Args:
        subsetNum: numeric values, indexable by member row position.
        subsetCat: categorical values, indexable by member row position.
        cluster_indexes: row positions of the cluster's members.
        CatAtr: list of categorical column names (only its length is used).
        NumAtr: list of numeric column names (only its length is used).
        prototype: the cluster prototype, laid out as described above.
        clust_numb: unused; kept so existing callers keep working.

    Returns:
        The average member-to-prototype distance, or ``0.0`` for an empty
        cluster (previously an empty cluster raised ``ZeroDivisionError``).
    """
    member_count = len(cluster_indexes)
    if member_count == 0:
        # No members means no spread; avoids dividing by zero below.
        return 0.0

    Ar = len(NumAtr)
    Ac = len(CatAtr)
    A = Ar + Ac

    if Ar and Ac:
        total = sum(
            (Ar / A) * ds.Eucdist(subsetNum[x], prototype[Ac:A])
            + (Ac / A) * ds.hamdist(subsetCat[x], prototype[0:Ac])
            for x in cluster_indexes
        )
    elif not Ar:
        total = sum(
            ds.hamdist(subsetCat[x], prototype[0:Ac]) for x in cluster_indexes
        )
    else:
        total = sum(
            ds.Eucdist(subsetNum[x], prototype[Ac:A]) for x in cluster_indexes
        )
    return total / member_count
def Kprototypes(dataset, CatAtr, NumAtr, Z, z, x):
    """Return the normalised mixed distance between row ``x`` and prototype ``z``.

    For each attribute kind, the distance from row ``x`` to ``z`` is divided
    by the sum of that same distance from row ``x`` to every prototype in
    ``Z`` and scaled by the kind's share of the attributes (``Ar/A`` for
    numeric, ``Ac/A`` for categorical, ``A = Ar + Ac``). Numeric parts are
    compared with ``ds.Eucdist`` and categorical parts with ``ds.hamdist``.
    Prototypes are laid out as ``[categorical part | numeric part]``.

    Args:
        dataset: DataFrame-like object supporting ``dataset[list_of_columns]``.
        CatAtr: list of categorical column names.
        NumAtr: list of numeric column names.
        Z: mapping whose values are all prototypes.
        z: the prototype to measure against.
        x: row position of the data point.

    Returns:
        The weighted, normalised distance described above.

    NOTE(review): when row ``x`` is at distance zero from every prototype for
    one attribute kind, the corresponding normaliser (``SN`` or ``SD``) is
    zero and the division is not guarded -- confirm whether callers can hit
    this.
    """
    # ``.values`` instead of ``DataFrame.as_matrix()``: the latter was
    # deprecated and then removed from pandas, ``.values`` works everywhere.
    subsetCat = dataset[CatAtr].values
    subsetNum = dataset[NumAtr].values
    Ar = len(NumAtr)
    Ac = len(CatAtr)
    A = Ar + Ac
    SN = 0  # sum of numeric distances from row x to every prototype
    SD = 0  # sum of categorical distances from row x to every prototype

    if Ar and Ac:
        for prototype in Z.values():
            SN += ds.Eucdist(subsetNum[x], prototype[Ac:A])
            SD += ds.hamdist(subsetCat[x], prototype[0:Ac])
        return ((Ar / (A * SN)) * ds.Eucdist(subsetNum[x], z[Ac:A])
                + (Ac / (A * SD)) * ds.hamdist(subsetCat[x], z[0:Ac]))
    elif not Ar:
        for prototype in Z.values():
            SD += ds.hamdist(subsetCat[x], prototype[0:Ac])
        return (Ac / (A * SD)) * ds.hamdist(subsetCat[x], z[0:Ac])
    else:
        # No categorical attributes: the whole prototype is numeric.
        for prototype in Z.values():
            SN += ds.Eucdist(subsetNum[x], prototype[0:Ar])
        return (Ar / (A * SN)) * ds.Eucdist(subsetNum[x], z[0:Ar])
def BE_C(data_set, cluster1, cluster2, clusters, CatAtr):
    """Two-cluster validation index over the categorical attributes.

    Computes the average ``ds.hamdist`` over every pair made of one member of
    ``clusters[cluster1]`` and one member of ``clusters[cluster2]``
    (the original note cites "p. 5, eq. 16").

    Args:
        data_set: DataFrame-like object supporting ``data_set[list_of_columns]``.
        cluster1: key of the first cluster in ``clusters``.
        cluster2: key of the second cluster in ``clusters``.
        clusters: container mapping a cluster key to its members' row positions.
        CatAtr: list of categorical column names.

    Returns:
        Sum of all pairwise distances divided by the number of pairs.
    """
    categorical = data_set[CatAtr].values
    members_a = categorical[clusters[cluster1]]
    members_b = categorical[clusters[cluster2]]
    pair_count = members_a.shape[0] * members_b.shape[0]
    total = sum(
        ds.hamdist(row_a, row_b)
        for row_a in members_a
        for row_b in members_b
    )
    return total / pair_count
def DaviesBouldin(data_set, CatAtr, NumAtr, clusters, prototypes):
    """Return a Davies-Bouldin style validity score for a clustering.

    For every cluster ``i`` the ratios ``(d_i + d_j) / d_ij`` are summed over
    all other clusters ``j`` and divided by the number of clusters; the
    largest of these per-cluster values is returned. ``d_i`` is the value of
    ``diameter`` for cluster ``i`` and ``d_ij`` is the distance between the
    two prototypes: ``(Ar/A) * ds.Eucdist(numeric parts) + (Ac/A) *
    ds.hamdist(categorical parts)`` when both attribute kinds are present,
    otherwise the single matching distance, unweighted. Prototypes are laid
    out as ``[categorical part | numeric part]``.

    NOTE(review): the textbook Davies-Bouldin index takes the maximum over
    ``j`` and the mean over ``i``; this function does the opposite (mean over
    ``j``, maximum over ``i``). The existing aggregation is preserved here --
    confirm which one is intended.

    Args:
        data_set: DataFrame-like object supporting ``data_set[list_of_columns]``.
        CatAtr: list of categorical column names.
        NumAtr: list of numeric column names.
        clusters: mapping whose values are the row positions of each
            cluster's members.
        prototypes: container indexable by ``0 .. len(clusters) - 1``; entry
            ``k`` is the prototype of the ``k``-th cluster in the iteration
            order of ``clusters.values()``.

    Returns:
        The largest per-cluster score.

    Raises:
        ValueError: if ``clusters`` is empty.
    """
    clusters_numb = len(clusters)
    # ``.values`` instead of ``DataFrame.as_matrix()``: the latter was
    # deprecated and then removed from pandas, ``.values`` works everywhere.
    subsetCat = data_set[CatAtr].values
    subsetNum = data_set[NumAtr].values
    Ar = len(NumAtr)
    Ac = len(CatAtr)
    A = Ar + Ac

    # Select the prototype-to-prototype distance once instead of repeating
    # the whole loop nest for every attribute mix.
    if Ar and Ac:
        def prototype_distance(p, q):
            return ((Ar / A) * ds.Eucdist(p[Ac:A], q[Ac:A])
                    + (Ac / A) * ds.hamdist(p[0:Ac], q[0:Ac]))
    elif not Ar:
        def prototype_distance(p, q):
            return ds.hamdist(p[0:Ac], q[0:Ac])
    else:
        def prototype_distance(p, q):
            return ds.Eucdist(p[Ac:A], q[Ac:A])

    # A cluster's diameter does not depend on which other cluster it is
    # paired with, so compute it once per cluster rather than once per pair.
    diameters = [
        diameter(subsetNum, subsetCat, members, CatAtr, NumAtr,
                 prototypes[k], clusters_numb)
        for k, members in enumerate(clusters.values())
    ]

    s = []
    for i in range(clusters_numb):
        t = 0
        for j in range(clusters_numb):
            if i != j:
                d_ij = prototype_distance(prototypes[i], prototypes[j])
                t += (diameters[i] + diameters[j]) / d_ij
        s.append(t / clusters_numb)
    return max(s)