def chooseInitialCentroids(data_set,CatAtr,NumAtr,clust_numb):
    '''
    Pick ``clust_numb`` row positions of ``data_set`` to serve as initial centroids.

    A full ``U x U`` pairwise distance matrix is built first (``U`` = number
    of rows).  With both attribute kinds present the distance between two
    rows is ``(Ar/A) * ds.Eucdist(numeric parts) + (Ac/A) * ds.hamdist(
    categorical parts)``, where ``Ar``/``Ac`` are the numbers of numeric and
    categorical attributes and ``A = Ar + Ac``.  With only one kind present,
    the corresponding ``ds`` distance is used unweighted.

    Centres are then selected greedily.  In each round the remaining row
    whose summed distance to all remaining rows is largest becomes a centre;
    afterwards the ``int(U / clust_numb)`` remaining rows with the smallest
    distance to that centre are removed from further consideration
    (presumably this includes the centre itself, assuming its self-distance
    is the row minimum -- depends on ``ds``).

    Args:
        data_set: pandas DataFrame holding the observations.
        CatAtr: list of categorical column labels (may be empty).
        NumAtr: list of numeric column labels (may be empty).
        clust_numb: number of centres to select.

    Returns:
        list of ``clust_numb`` positional row indices into ``data_set``.

    Raises:
        ZeroDivisionError: if ``clust_numb`` is 0.
        ValueError: if ``data_set`` has no rows and ``clust_numb`` > 0.
    '''
    # ``.values`` replaces ``DataFrame.as_matrix()``, which no longer exists
    # in pandas >= 1.0.
    subsetCat = data_set[CatAtr].values
    subsetNum = data_set[NumAtr].values
    U = data_set.shape[0]

    Ar = len(NumAtr)
    Ac = len(CatAtr)
    A = Ar+Ac

    # Choose the row-to-row distance once instead of repeating the double
    # loop for every attribute mix.
    if Ar and Ac:
        def pair_dist(i, j):
            return (Ar/A)*ds.Eucdist(subsetNum[i],subsetNum[j]) + (Ac/A)*ds.hamdist(subsetCat[i],subsetCat[j])
    elif not Ar:
        def pair_dist(i, j):
            return ds.hamdist(subsetCat[i],subsetCat[j])
    else:
        def pair_dist(i, j):
            return ds.Eucdist(subsetNum[i],subsetNum[j])

    D = np.zeros(shape = (U,U))
    for i in range(U):
        for j in range(U):
            D[i, j] = pair_dist(i, j)

    centers = []
    # Maps a position in the (shrinking) matrix D back to the original row.
    remembered_indexes = list(range(U))
    step = int(U/clust_numb)
    for _round in range(clust_numb):
        # Remaining row that is farthest, in total, from all remaining rows.
        max_ind = int(np.argmax(np.sum(D, axis=0)))
        centers.append(remembered_indexes[max_ind])

        # Distances from the new centre to every remaining row; kept aligned
        # with D's columns by popping the same position that is deleted.
        max_ind_row = list(D[max_ind,:])

        for _drop in range(step):
            min_ind_inner = max_ind_row.index(min(max_ind_row))
            D = np.delete(D, min_ind_inner, axis=0)
            D = np.delete(D, min_ind_inner, axis=1)
            remembered_indexes.pop(min_ind_inner)
            max_ind_row.pop(min_ind_inner)

    return centers
    
def dunnIndex(data_set, CatAtr, NumAtr, clusters):
    '''
    Compute a Dunn-style validity index for a clustering of ``data_set``.

    The value returned is the smallest distance between two rows that belong
    to different clusters divided by the largest distance between two rows
    of the same cluster.  Row distances are ``ds.Eucdist`` on the numeric
    columns and ``ds.hamdist`` on the categorical columns, weighted by
    ``Ar/A`` and ``Ac/A`` when both kinds are present (``Ar``/``Ac`` are the
    attribute counts, ``A`` their sum).

    Args:
        data_set: pandas DataFrame holding the observations.
        CatAtr: list of categorical column labels (may be empty).
        NumAtr: list of numeric column labels (may be empty).
        clusters: container indexable by ``0 .. len(clusters) - 1``; each
            entry is a sequence of positional row indices (used for NumPy
            fancy indexing).

    Returns:
        The ratio as a NumPy float.  With a single cluster there is no
        inter-cluster pair, so the numerator (and the result) is NaN.
    '''
    # ``.values`` replaces ``DataFrame.as_matrix()``, which no longer exists
    # in pandas >= 1.0.
    subsetCat = data_set[CatAtr].values
    subsetNum = data_set[NumAtr].values
    U = data_set.shape[0]
    Ar = len(NumAtr)
    Ac = len(CatAtr)
    A = Ar + Ac

    # Choose the row-to-row distance once instead of repeating the double
    # loop for every attribute mix.
    if Ar and Ac:
        def pair_dist(i, j):
            return (Ar / A) * ds.Eucdist(subsetNum[i], subsetNum[j]) + (
                Ac / A) * ds.hamdist(subsetCat[i], subsetCat[j])
    elif not Ar:
        def pair_dist(i, j):
            return ds.hamdist(subsetCat[i], subsetCat[j])
    else:
        def pair_dist(i, j):
            return ds.Eucdist(subsetNum[i], subsetNum[j])

    D = np.zeros(shape=(U, U))
    for i in range(U):
        for j in range(U):
            D[i, j] = pair_dist(i, j)

    nc = len(clusters)
    # Only the strict upper triangle is filled; the NaN entries elsewhere
    # are skipped by nanmin below.  ``np.nan`` is used because the ``np.NAN``
    # alias was removed in NumPy 2.0.
    interClust = np.full((nc, nc), np.nan)
    intraClust = np.zeros(shape=(1, nc))

    for i in range(nc):
        c1 = clusters[i]
        rows_i = D[c1, :]
        # Largest distance between two members of cluster i.
        intraClust[0][i] = np.max(rows_i[:, c1])
        for j in range(i + 1, nc):
            # Smallest distance between a member of i and a member of j.
            interClust[i][j] = np.min(rows_i[:, clusters[j]])
    return np.nanmin(interClust) / np.max(intraClust)
Example #3
0
def diameter(subsetNum, subsetCat, cluster_indexes, CatAtr, NumAtr, prototype,
             clust_numb):
    '''
    Average distance between the rows of one cluster and its prototype.

    ``prototype`` is sliced as ``[0:Ac]`` for the categorical part and
    ``[Ac:A]`` for the numeric part (``Ac``/``Ar`` are the attribute counts,
    ``A`` their sum).  With both kinds present the per-row distance is
    ``(Ar/A) * ds.Eucdist + (Ac/A) * ds.hamdist``; with one kind only, the
    matching ``ds`` distance is used unweighted.

    Args:
        subsetNum: numeric attribute rows, indexable by row position.
        subsetCat: categorical attribute rows, indexable by row position.
        cluster_indexes: row positions that make up the cluster.
        CatAtr: list of categorical column labels (only its length is used).
        NumAtr: list of numeric column labels (only its length is used).
        prototype: the cluster's prototype.
        clust_numb: not used by this function.

    Returns:
        Sum of the per-row distances divided by ``len(cluster_indexes)``.
    '''
    n_num = len(NumAtr)
    n_cat = len(CatAtr)
    n_all = n_num + n_cat

    if n_num and n_cat:
        def to_prototype(row):
            return (n_num / n_all) * ds.Eucdist(
                subsetNum[row], prototype[n_cat:n_all]) + (
                    n_cat / n_all) * ds.hamdist(subsetCat[row],
                                                prototype[0:n_cat])
    elif not n_num:
        def to_prototype(row):
            return ds.hamdist(subsetCat[row], prototype[0:n_cat])
    else:
        def to_prototype(row):
            return ds.Eucdist(subsetNum[row], prototype[n_cat:n_all])

    # Accumulate explicitly (not via sum()) so the additions happen in the
    # same order and manner for every numeric type.
    total = 0
    for row in cluster_indexes:
        total += to_prototype(row)
    return total / len(cluster_indexes)
def Kprototypes(dataset,CatAtr,NumAtr,Z,z,x):
    '''
    Normalised mixed-type distance between row ``x`` and prototype ``z``.

    Prototypes are sliced as ``[0:Ac]`` for the categorical part and
    ``[Ac:A]`` for the numeric part (``Ac``/``Ar`` are the attribute counts,
    ``A`` their sum).  ``SN`` is the sum of ``ds.Eucdist`` from row ``x`` to
    every prototype in ``Z`` and ``SD`` the corresponding sum of
    ``ds.hamdist``.  The result is::

        Ar / (A * SN) * Eucdist(x, z) + Ac / (A * SD) * hamdist(x, z)

    with the term for an absent attribute kind left out.

    Args:
        dataset: pandas DataFrame holding the observations.
        CatAtr: list of categorical column labels (may be empty).
        NumAtr: list of numeric column labels (may be empty).
        Z: dict whose values are all prototypes.
        z: the prototype the distance is measured to.
        x: positional index of the row in ``dataset``.

    Returns:
        The normalised distance.

    NOTE(review): if ``SN`` or ``SD`` is zero (row ``x`` coincides with every
    prototype on that attribute kind) the division is by zero; behaviour then
    depends on the numeric type returned by ``ds`` -- confirm callers never
    hit this.
    '''
    # ``.values`` replaces ``DataFrame.as_matrix()``, which no longer exists
    # in pandas >= 1.0.
    subsetCat = dataset[CatAtr].values
    subsetNum = dataset[NumAtr].values

    Ar = len(NumAtr)
    Ac = len(CatAtr)
    A = Ar+Ac
    SN = 0
    SD = 0
    if Ar and Ac:
        for prototype in Z.values():
            SN += ds.Eucdist(subsetNum[x],prototype[Ac:A])
            SD += ds.hamdist(subsetCat[x],prototype[0:Ac])
        return((Ar/(A*SN))*ds.Eucdist(subsetNum[x],z[Ac:A])+(Ac/(A*SD))*ds.hamdist(subsetCat[x],z[0:Ac]))
    elif not Ar:
        for prototype in Z.values():
            SD += ds.hamdist(subsetCat[x],prototype[0:Ac])
        return((Ac/(A*SD))*ds.hamdist(subsetCat[x],z[0:Ac]))
    else:
        # No categorical attributes: the whole prototype is numeric.
        for prototype in Z.values():
            SN += ds.Eucdist(subsetNum[x],prototype[0:Ar])
        return((Ar/(A*SN))*ds.Eucdist(subsetNum[x],z[0:Ar]))
Example #5
0
def BE_C(data_set, cluster1, cluster2, clusters, CatAtr):
    '''
    Two-cluster validation index for categorical attributes
    (p. 5, eq. 16).

    Averages ds.hamdist over every pair made of one row from
    clusters[cluster1] and one row from clusters[cluster2], using only
    the CatAtr columns of data_set.
    '''
    rows_a = data_set[CatAtr].values[clusters[cluster1]]
    rows_b = data_set[CatAtr].values[clusters[cluster2]]
    count_a = rows_a.shape[0]
    count_b = rows_b.shape[0]

    total = 0
    for row_a in rows_a:
        for row_b in rows_b:
            total += ds.hamdist(row_a, row_b)
    return total / (count_a * count_b)
Example #6
0
def DaviesBouldin(data_set, CatAtr, NumAtr, clusters, prototypes):
    '''
    Davies-Bouldin-style validity index for a mixed-attribute clustering.

    For every cluster ``i`` the ratios ``(d_i + d_j) / d_ij`` are summed over
    all other clusters ``j`` and divided by the number of clusters; the
    largest of these per-cluster values is returned.  ``d_i`` is
    ``diameter(...)`` of cluster ``i`` and ``d_ij`` is the distance between
    the two prototypes: ``ds.Eucdist`` on the numeric slice ``[Ac:A]`` and
    ``ds.hamdist`` on the categorical slice ``[0:Ac]``, weighted by ``Ar/A``
    and ``Ac/A`` when both kinds are present.

    NOTE(review): the textbook Davies-Bouldin index averages the per-cluster
    *maximum* ratio; this function takes the maximum of per-cluster
    *averages*.  The original aggregation is preserved -- confirm it is
    intended.

    Args:
        data_set: pandas DataFrame holding the observations.
        CatAtr: list of categorical column labels (may be empty).
        NumAtr: list of numeric column labels (may be empty).
        clusters: dict whose values are the row positions of each cluster.
        prototypes: container indexed by ``0 .. len(clusters) - 1``; entry
            ``i`` is paired with the ``i``-th value of ``clusters`` in
            iteration order.

    Returns:
        The largest per-cluster averaged ratio.

    Raises:
        ValueError: if ``clusters`` is empty.
    '''
    clusters_numb = len(clusters)
    # ``.values`` replaces ``DataFrame.as_matrix()``, which no longer exists
    # in pandas >= 1.0.
    subsetCat = data_set[CatAtr].values
    subsetNum = data_set[NumAtr].values
    Ar = len(NumAtr)
    Ac = len(CatAtr)
    A = Ar + Ac

    # Distance between two prototypes, chosen once for the attribute mix.
    if Ar and Ac:
        def proto_dist(p, q):
            return (Ar / A) * ds.Eucdist(p[Ac:A], q[Ac:A]) + (
                Ac / A) * ds.hamdist(p[0:Ac], q[0:Ac])
    elif not Ar:
        def proto_dist(p, q):
            return ds.hamdist(p[0:Ac], q[0:Ac])
    else:
        def proto_dist(p, q):
            return ds.Eucdist(p[Ac:A], q[Ac:A])

    # Each cluster's diameter depends only on that cluster, so compute it
    # once up front instead of once per (i, j) pair.
    protos = []
    spreads = []
    for i, members in enumerate(clusters.values()):
        protos.append(prototypes[i])
        spreads.append(
            diameter(subsetNum, subsetCat, members, CatAtr, NumAtr,
                     prototypes[i], clusters_numb))

    s = []
    for i, (proto_i, d_i) in enumerate(zip(protos, spreads)):
        t = 0
        for j, (proto_j, d_j) in enumerate(zip(protos, spreads)):
            if i != j:
                t += (d_i + d_j) / proto_dist(proto_i, proto_j)
        s.append(t / clusters_numb)
    return max(s)