def os(X, labels, n_clusters):
    """Return the negated ratio of summed point overlap to cluster spread.

    The numerator adds ``ov(X, labels, X[i], k)`` for every point, visiting
    the points cluster by cluster.  The denominator adds, per cluster, ten
    times the sum of its ``ceil(0.1 * size)`` largest point-to-centroid
    distances divided by the cluster size.

    NOTE(review): ``ov`` is defined elsewhere; presumably this is the OS
    validity index -- confirm.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    overlap_total = 0.0
    for cluster in range(n_clusters):
        for idx, label in enumerate(labels):
            if label == cluster:
                overlap_total += ov(X, labels, X[idx], cluster)

    spread_total = 0.0
    for cluster in range(n_clusters):
        member_dists = [
            utils.euclidian_dist(X[idx], centroids[cluster])
            for idx, label in enumerate(labels)
            if label == cluster
        ]
        # Sum of the 0.1 * |C_k| largest centroid distances of the cluster.
        top_count = int(math.ceil(0.1 * cluster_sizes[cluster]))
        top_sum = 0.0
        for value in heapq.nlargest(top_count, member_dists):
            top_sum += value
        spread_total += top_sum * 10.0 / cluster_sizes[cluster]

    return -overlap_total / spread_total
def dunn43(X, labels, n_clusters):
    """Return the negated ratio of centroid separation to cluster diameter.

    Separation is the smallest centroid-to-centroid distance between two
    different non-empty clusters.  The diameter of a cluster is twice the
    mean distance of its points to its centroid; the largest diameter is
    used as the denominator.

    Args:
        X: 2-D array of points, one row per point.
        labels: cluster index of every row of ``X``.
        n_clusters: number of clusters.

    Returns:
        ``-separation / largest_diameter``.

    The previous version added each point's centroid distance once per
    same-cluster partner and then *added* 2.0 instead of doubling the mean,
    so the denominator was not a diameter at all.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)

    cluster_sizes = [0] * n_clusters
    centroid_dist_sums = [0.0] * n_clusters
    for idx, label in enumerate(labels):
        cluster_sizes[label] += 1
        centroid_dist_sums[label] += utils.euclidian_dist(X[idx], centroids[label])

    # Only clusters that actually own points take part in either term.
    occupied = [k for k in range(n_clusters) if cluster_sizes[k] > 0]

    minimum_dif_c = sys.float_info.max  # min centroid dist, different clusters
    for pos, first in enumerate(occupied):
        for second in occupied[pos + 1:]:
            minimum_dif_c = min(
                utils.euclidian_dist(centroids[first], centroids[second]),
                minimum_dif_c)

    maximum_same_c = 0.0  # largest cluster diameter
    for k in occupied:
        maximum_same_c = max(2.0 * centroid_dist_sums[k] / cluster_sizes[k],
                             maximum_same_c)

    return -minimum_dif_c / maximum_same_c
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    ``self.s_c`` is the sum of all same-cluster pairwise distances and
    ``self.n_w`` the number of same-cluster pairs.  ``self.s_min`` and
    ``self.s_max`` hold the ``int(n_w)`` smallest and largest pairwise
    distances of the whole data set.

    Returns ``(s_c - sum(s_min)) / (sum(s_max) - sum(s_min))``.
    """
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.diameter = utils.find_diameter(X)
    self.cluster_sizes = []
    self.distances = []
    self.s_c = 0
    self.n_w = 0
    rows, colums = X.shape

    # Total distance between points sharing a cluster.
    for first in range(rows - 1):
        for second in range(first + 1, rows):
            if labels[first] == labels[second]:
                self.s_c += utils.euclidian_dist(X[first], X[second])

    # Number of same-cluster pairs: sum of size * (size - 1) / 2.
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    for cluster in range(n_clusters):
        size = self.cluster_sizes[cluster]
        self.n_w += size * (size - 1) / 2

    # Every pairwise distance, regardless of cluster membership.
    n_points = len(labels)
    for first in range(n_points - 1):
        for second in range(first + 1, n_points):
            self.distances.append(utils.euclidian_dist(X[first], X[second]))

    pair_count = int(self.n_w)
    self.s_min = heapq.nsmallest(pair_count, self.distances)
    self.s_max = heapq.nlargest(pair_count, self.distances)
    s_min_c = sum(self.s_min)
    s_max_c = sum(self.s_max)
    return (self.s_c - s_min_c) / (s_max_c - s_min_c)
def cs_index(X, labels, n_clusters):
    """Return the ratio of within-cluster extent to centroid separation.

    Numerator: for every point, the distance to the farthest point of its
    own cluster divided by the size of that cluster, summed over all points.
    Denominator: for every cluster, the distance from its centroid to the
    nearest *other* centroid, summed over all clusters.

    Args:
        X: 2-D array of points, one row per point.
        labels: cluster index of every row of ``X``.
        n_clusters: number of clusters.

    Returns:
        ``numerator / denominator``.

    Raises:
        AssertionError: if the denominator is zero (kept from the original
            contract).

    Fixes relative to the previous version:
      * the inner loop ran over ``range(i, elements - 1)``, so the last
        point and every point before ``i`` were never compared with ``i``;
      * the nearest-centroid search only looked at higher-indexed clusters,
        so the last cluster contributed ``sys.float_info.max``;
      * the per-point maximum started at ``sys.float_info.min`` (a tiny
        positive number), not at zero.
    """
    elements, _ = X.shape
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    # max_dists[i]: distance from point i to the farthest point of its cluster.
    max_dists = [0.0] * elements
    for i in range(elements - 1):
        for j in range(i + 1, elements):
            if labels[i] != labels[j]:
                continue
            dist = utils.euclidian_dist(X[i], X[j])
            max_dists[i] = max(max_dists[i], dist)
            max_dists[j] = max(max_dists[j], dist)

    numerator = 0.0
    for i in range(elements):
        numerator += max_dists[i] / cluster_sizes[labels[i]]

    denominator = 0.0
    for i in range(n_clusters):
        min_centroids_dist = sys.float_info.max
        for j in range(n_clusters):
            if i != j:
                min_centroids_dist = min(
                    utils.euclidian_dist(centroids[i], centroids[j]),
                    min_centroids_dist)
        denominator += min_centroids_dist

    assert denominator != 0.0
    return numerator / denominator
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: centroids, the full pairwise distance matrix
    (``self.dists``), per-point centroid distances, per-cluster sums of
    those distances (``self.sums``, rescaled by ``2 / size`` before
    returning) and the per-cluster-pair accumulated distances
    (``self.delta``, divided by the product of the two cluster sizes).

    Returns the negated ratio of the smallest non-zero entry of
    ``self.delta`` to the largest rescaled entry of ``self.sums``.
    """
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.dists = [[0.] * len(labels) for _ in range(len(labels))]
    self.sums = [0] * n_clusters
    rows, colums = X.shape
    self.point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.delta_l = [[0.0] * n_clusters] * n_clusters
    self.delta = np.array(self.delta_l)
    self.centroid_dists = [0] * len(labels)
    smallest_between = sys.float_info.max

    for idx, label in enumerate(labels):
        self.centroid_dists[idx] = utils.euclidian_dist(X[idx], self.centroids[label])
        self.sums[label] += self.centroid_dists[idx]

    for first in range(rows - 1):
        for second in range(first + 1, rows):
            self.dists[first][second] = utils.euclidian_dist(X[first], X[second])
            self.dists[second][first] = self.dists[first][second]
            if labels[first] != labels[second]:
                self.delta[labels[first]][labels[second]] += self.dists[first][second]

    for a in range(n_clusters):
        for b in range(n_clusters):
            self.delta[a][b] /= float(self.point_in_c[a] * self.point_in_c[b])
            if self.delta[a][b] != 0:
                smallest_between = min(smallest_between, self.delta[a][b])
        # Turn the summed centroid distance into twice the mean distance.
        self.sums[a] *= (2 / self.point_in_c[a])

    return -(smallest_between / max(self.sums))
def sv(X, labels, n_clusters):
    """Return the negated ratio of centroid separation to cluster spread.

    Numerator: for every cluster, the distance from its centroid to the
    nearest *other* centroid, summed over all clusters.
    Denominator: for every cluster, ten times the sum of its
    ``ceil(0.1 * size)`` largest point-to-centroid distances divided by the
    cluster size, summed over all clusters.

    Args:
        X: indexable collection of points.
        labels: cluster index of every point.
        n_clusters: number of clusters.

    Returns:
        ``-numerator / denominator``.

    The previous version searched for the nearest centroid only among
    higher-indexed clusters and skipped the last cluster entirely, so the
    numerator depended on the cluster numbering.  With fewer than two
    clusters the numerator stays 0.0, as before.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    numerator = 0.0
    if n_clusters > 1:
        for k in range(n_clusters):
            min_dist = sys.float_info.max
            for other in range(n_clusters):
                if other != k:
                    min_dist = min(
                        min_dist,
                        utils.euclidian_dist(centroids[k], centroids[other]))
            numerator += min_dist

    denominator = 0.0
    for k in range(n_clusters):
        member_dists = [
            utils.euclidian_dist(X[i], centroids[k])
            for i, label in enumerate(labels)
            if label == k
        ]
        # Sum of the 0.1 * |C_k| largest centroid distances of the cluster.
        acc = 0.0
        for value in heapq.nlargest(int(math.ceil(0.1 * cluster_sizes[k])),
                                    member_dists):
            acc += value
        denominator += acc * 10.0 / cluster_sizes[k]

    return -numerator / denominator
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: centroids, cluster sizes, data diameter, the full
    pairwise distance matrix (``self.dists``), the index pairs of points
    sharing a cluster (``self.dist_same_c``) and the accumulated
    between-cluster distances (``self.delta``).

    Returns the negated ratio of the smallest non-zero averaged entry in
    the upper triangle of ``self.delta`` to the largest same-cluster
    pairwise distance.
    """
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.diameter = utils.find_diameter(X)
    self.dists = [[0.] * len(labels) for _ in range(len(labels))]
    self.dist_same_c = []
    rows, colums = X.shape
    self.delta = np.array([[0.0] * n_clusters] * n_clusters)

    smallest_between = sys.float_info.max  # min dist, different clusters
    largest_within = sys.float_info.min  # max dist, same cluster

    for first in range(rows - 1):
        for second in range(first + 1, rows):
            self.dists[first][second] = utils.euclidian_dist(X[first], X[second])
            self.dists[second][first] = self.dists[first][second]
            if labels[first] != labels[second]:
                self.delta[labels[first]][labels[second]] += self.dists[first][second]
            else:
                self.dist_same_c.append([first, second])
                largest_within = max(self.dists[first][second], largest_within)

    for a in range(n_clusters - 1):
        for b in range(a + 1, n_clusters):
            self.delta[a][b] /= float(self.cluster_sizes[a] * self.cluster_sizes[b])
            if self.delta[a][b] != 0:
                smallest_between = min(smallest_between, self.delta[a][b])

    return -smallest_between / largest_within
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes, per-point
    centroid distances, per-cluster sums of those distances
    (``self.sums``) and the centroid-to-centroid distance matrix
    (``self.centers``, diagonal left at ``sys.float_info.max``).

    Returns the negated ratio of the smallest centroid distance between the
    clusters of two differently labelled points to the largest
    ``2 * sum / size`` over all clusters.
    """
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    rows, colums = X.shape
    self.sums = [0] * n_clusters
    closest_centroids = sys.float_info.max  # min dist, different clusters
    self.centers = np.array([[sys.float_info.max] * n_clusters] * n_clusters)
    self.centroid_dists = [0] * len(labels)

    for idx, label in enumerate(labels):
        self.centroid_dists[idx] = utils.euclidian_dist(X[idx], self.centroids[label])
        self.sums[label] += self.centroid_dists[idx]

    for a in range(n_clusters):
        for b in range(n_clusters):
            if a == b:
                continue
            self.centers[a][b] = utils.euclidian_dist(self.centroids[a],
                                                      self.centroids[b])

    for first in range(rows):
        for second in range(rows):
            if labels[first] != labels[second]:
                gap = self.centers[labels[first]][labels[second]]
                closest_centroids = min(gap, closest_centroids)

    # Twice the mean centroid distance of every cluster.
    denominator = [
        self.sums[k] * (2 / self.cluster_sizes[k]) for k in range(n_clusters)
    ]
    return -(closest_centroids / max(denominator))
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes, the symmetric
    centroid distance matrix (``self.centroid_dists``, diagonal left at
    ``sys.float_info.max``) and, per cluster, the centroid distance of each
    of its points (``self.dists``, zero for points of other clusters).

    Returns the negated ratio of the summed nearest-centroid distances to
    the summed per-cluster spread (ten times the sum of the
    ``ceil(0.1 * size)`` largest centroid distances, divided by the size).
    """
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroid_dists = [[sys.float_info.max] * n_clusters
                           for _ in range(n_clusters)]
    self.dists = [[0] * len(labels) for _ in range(n_clusters)]

    separation = 0.0
    for k in range(n_clusters - 1):
        for other in range(k + 1, n_clusters):
            self.centroid_dists[k][other] = utils.euclidian_dist(
                self.centroids[k], self.centroids[other])
            self.centroid_dists[other][k] = self.centroid_dists[k][other]
    for k in range(n_clusters):
        separation += np.amin(self.centroid_dists[k])

    spread = 0.0
    for k in range(n_clusters):
        for idx, label in enumerate(labels):
            if label == k:
                self.dists[k][idx] = utils.euclidian_dist(X[idx], self.centroids[k])
    for k in range(n_clusters):
        # Sum of the 0.1 * |C_k| largest centroid distances of the cluster.
        top_count = int(math.ceil(0.1 * self.cluster_sizes[k]))
        top_sum = 0.0
        for value in heapq.nlargest(top_count, self.dists[k]):
            top_sum += value
        spread += top_sum * 10.0 / self.cluster_sizes[k]

    return -(separation / spread)
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes, the full
    pairwise distance matrix (``self.dists``), the index pairs of points
    sharing a cluster (``self.dist_same_c``) and the centroid distance
    matrix (``self.centers``, diagonal left at ``sys.float_info.max``).

    Returns the negated ratio of the smallest centroid distance between the
    clusters of two differently labelled points to the largest same-cluster
    pairwise distance.
    """
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.dist_same_c = []
    rows, colums = X.shape
    self.dists = [[0.] * rows for _ in range(rows)]
    closest_centroids = sys.float_info.max  # min dist, different clusters
    largest_within = sys.float_info.min  # max dist, same cluster
    self.centers = np.array([[sys.float_info.max] * n_clusters] * n_clusters)

    for a in range(n_clusters):
        for b in range(n_clusters):
            if a == b:
                continue
            self.centers[a][b] = utils.euclidian_dist(self.centroids[a],
                                                      self.centroids[b])

    for first in range(rows - 1):
        for second in range(first + 1, rows):
            self.dists[first][second] = utils.euclidian_dist(X[first], X[second])
            self.dists[second][first] = self.dists[first][second]
            if labels[first] != labels[second]:
                gap = self.centers[labels[first]][labels[second]]
                closest_centroids = min(gap, closest_centroids)
            else:
                self.dist_same_c.append([first, second])
                largest_within = max(self.dists[first][second], largest_within)

    return -(closest_centroids / largest_within)
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes
    (``self.points_in_clusters``), the per-cluster value of ``self.s``
    (``self.s_clusters``) and, for every ordered pair of different clusters
    whose centroids do not coincide, ``(s_i + s_j) / centroid_distance``
    (``self.sums``; all other entries stay 0).

    Returns the mean over clusters of the row maximum of ``self.sums``.
    """
    self.diameter = utils.find_diameter(X)
    self.s_clusters = [0.] * n_clusters
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    total = 0
    self.points_in_clusters = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    for k in range(n_clusters):
        self.s_clusters[k] = self.s(X, k, self.points_in_clusters, labels,
                                    self.centroids)

    self.sums = [[0] * n_clusters for _ in range(n_clusters)]
    for a in range(n_clusters):
        for b in range(n_clusters):
            if a == b:
                continue
            gap = utils.euclidian_dist(self.centroids[a], self.centroids[b])
            # Coinciding centroids are skipped; the entry stays 0.
            if gap != 0:
                self.sums[a][b] = (self.s_clusters[a] + self.s_clusters[b]) / gap
        total += np.amax(self.sums[a])

    total /= float(n_clusters)
    return total
def dunn53(X, labels, n_clusters):
    """Return the negated ratio of cluster separation to cluster diameter.

    Separation of two non-empty clusters is the sum of all their
    point-to-own-centroid distances divided by their combined size; the
    smallest value over all cluster pairs is used.  The diameter of a
    cluster is twice the mean distance of its points to its centroid; the
    largest diameter is used as the denominator.

    Args:
        X: indexable collection of points.
        labels: cluster index of every point.
        n_clusters: number of clusters.

    Returns:
        ``-smallest_separation / largest_diameter``.

    Fixes relative to the previous version:
      * only the first half of the rows was scanned, so points in the
        second half were under-counted;
      * the diameter was computed with ``+= 2.0`` instead of doubling;
      * the zero diagonal of the separation matrix entered the minimum, so
        the result was always ``-0.0``.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)

    cluster_sizes = [0] * n_clusters
    centroid_dist_sums = [0.0] * n_clusters
    for idx, label in enumerate(labels):
        cluster_sizes[label] += 1
        centroid_dist_sums[label] += utils.euclidian_dist(X[idx], centroids[label])

    # Only clusters that actually own points take part in either term.
    occupied = [k for k in range(n_clusters) if cluster_sizes[k] > 0]

    minimum_dif_c = sys.float_info.max  # min separation, different clusters
    for pos, first in enumerate(occupied):
        for second in occupied[pos + 1:]:
            separation = (
                (centroid_dist_sums[first] + centroid_dist_sums[second])
                / float(cluster_sizes[first] + cluster_sizes[second]))
            minimum_dif_c = min(minimum_dif_c, separation)

    maximum_same_c = 0.0  # largest cluster diameter
    for k in occupied:
        maximum_same_c = max(2.0 * centroid_dist_sums[k] / cluster_sizes[k],
                             maximum_same_c)

    return -minimum_dif_c / maximum_same_c
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes, the full
    pairwise distance matrix (``self.dists``), per-point centroid distances,
    per-cluster sums of those distances (``self.sums``), the index pairs of
    points sharing a cluster (``self.dists_same_c``) and, for every ordered
    pair of different clusters, the combined centroid-distance sum divided
    by the combined size (``self.delta``).

    Args:
        X: indexable collection of points.
        labels: cluster index of every point.
        n_clusters: number of clusters.

    Returns:
        The negated ratio of the smallest off-diagonal ``self.delta`` entry
        to the largest same-cluster pairwise distance.

    Fixes relative to the previous version:
      * ``self.dists_same_c`` was appended to without being initialised in
        this method;
      * every pair, not just same-cluster pairs, was recorded and fed into
        the "max distance in the same cluster", which made the denominator
        the diameter of the whole data set.
    """
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    n_points = len(labels)
    self.dists = [[0] * n_points for _ in range(n_points)]
    self.centroid_dists = [0] * n_points
    self.delta = [[0] * n_clusters for _ in range(n_clusters)]
    self.dists_same_c = []
    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    maximum_same_c = sys.float_info.min  # max dist in the same cluster
    self.sums = [0] * n_clusters

    for i, label in enumerate(labels):
        self.centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[label])
        self.sums[label] += self.centroid_dists[i]

    for i in range(n_points - 1):
        for j in range(i + 1, n_points):
            self.dists[i][j] = utils.euclidian_dist(X[i], X[j])
            self.dists[j][i] = self.dists[i][j]
            if labels[i] == labels[j]:
                self.dists_same_c.append([i, j])
                maximum_same_c = max(self.dists[i][j], maximum_same_c)

    for i in range(n_clusters):
        for j in range(n_clusters):
            if i != j:
                self.delta[i][j] = (self.sums[i] + self.sums[j]) / float(
                    self.cluster_sizes[i] + self.cluster_sizes[j])
                minimum_dif_c = min(minimum_dif_c, self.delta[i][j])

    return -(minimum_dif_c / maximum_same_c)
def sf(X, labels, n_clusters):
    """Return ``-(1 - 1 / exp(exp(-bcd - wcd)))``.

    ``bcd`` and ``wcd`` come from ``bcd_score`` and ``wcd_score``, which are
    defined elsewhere.

    NOTE(review): the original author marked the exponent ``-bcd - wcd``
    with "?????"; its sign convention should be confirmed against the
    helper functions.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    between = bcd_score(X, labels, n_clusters, centroids, cluster_sizes)
    within = wcd_score(X, labels, n_clusters, centroids, cluster_sizes)
    inner = math.exp(-between - within)
    reciprocal = 1.0 / math.exp(inner)
    return -(1.0 - reciprocal)
def sym(X, labels, n_clusters):
    """Return ``-(max_centroid_distance / sum_d_ps / n_clusters)``.

    The maximum runs over centroid pairs ``(k, l)`` with ``k`` below the
    last cluster and ``l >= k``, starting from ``sys.float_info.min``.  The
    sum adds ``d_ps`` (defined elsewhere) for every point with respect to
    its own cluster.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)

    widest_gap = sys.float_info.min
    for k in range(n_clusters - 1):
        for other in range(k, n_clusters):
            widest_gap = max(
                widest_gap,
                utils.euclidian_dist(centroids[k], centroids[other]))

    point_total = 0.0
    for idx, label in enumerate(labels):
        point_total += d_ps(X, labels, X[idx], label, centroids)

    return -(widest_gap / point_total / n_clusters)
def sym_db(X, labels, n_clusters):
    """Return the mean over clusters of the worst pairwise scatter ratio.

    For a cluster ``k`` the ratio with another cluster ``l`` is
    ``(sym_s(k) + sym_s(l)) / centroid_distance(k, l)``; the largest ratio
    over all ``l != k`` is taken and these maxima are averaged.

    Args:
        X: indexable collection of points.
        labels: cluster index of every point.
        n_clusters: number of clusters.

    Returns:
        The averaged maximum ratio.

    Fixes relative to the previous version:
      * the running maximum was never reset, so every cluster inherited the
        largest ratio seen for any earlier cluster;
      * ``sym_s`` was recomputed for every cluster pair instead of once per
        cluster.
    The per-cluster maximum now starts at 0.0 (ratios are assumed
    non-negative) rather than at ``sys.float_info.min``.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    scatter = [
        sym_s(X, labels, k, cluster_sizes, centroids) for k in range(n_clusters)
    ]

    db = 0.0
    for k in range(n_clusters):
        max_fraction = 0.0
        for other in range(n_clusters):
            if k != other:
                fraction = ((scatter[k] + scatter[other]) /
                            utils.euclidian_dist(centroids[k], centroids[other]))
                max_fraction = max(max_fraction, fraction)
        db += max_fraction

    db /= float(n_clusters)
    return db
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: centroids, cluster sizes, data diameter, the per-cluster
    value of ``self.s`` (``self.s_clusters``) and two matrices filled only
    above the diagonal: ``self.max_s_sum[k][l] = s_k + s_l`` and
    ``self.min_centroids_dist[k][l]`` (centroid distance).  Unfilled
    entries keep ``sys.float_info.min`` / ``sys.float_info.max``.

    Returns the mean over clusters of ``max(max_s_sum[k]) /
    min(min_centroids_dist[k])``.
    """
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.max_s_sum = [[sys.float_info.min] * n_clusters
                      for _ in range(n_clusters)]
    self.min_centroids_dist = [[sys.float_info.max] * n_clusters
                               for _ in range(n_clusters)]
    self.s_clusters = [0] * n_clusters
    self.diameter = utils.find_diameter(X)

    for k in range(n_clusters):
        self.s_clusters[k] = self.s(X, k, self.cluster_sizes, labels,
                                    self.centroids)

    total = 0.0
    for k in range(n_clusters):
        scatter_row = self.max_s_sum[k]
        gap_row = self.min_centroids_dist[k]
        for other in range(k + 1, n_clusters):
            scatter_row[other] = self.s_clusters[k] + self.s_clusters[other]
            gap_row[other] = utils.euclidian_dist(self.centroids[k],
                                                  self.centroids[other])
        total += np.max(scatter_row) / np.min(gap_row)

    return total / n_clusters
def db_star_index(X, labels, n_clusters):
    """Return the mean over clusters of max scatter sum / min centroid gap.

    For cluster ``k`` both the maximum of ``s(k) + s(l)`` and the minimum
    centroid distance run over higher-indexed clusters ``l > k`` only,
    starting from ``sys.float_info.min`` and ``sys.float_info.max``
    respectively.  ``s`` is defined elsewhere.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    total = 0.0
    for k in range(n_clusters):
        widest_scatter = sys.float_info.min
        closest_gap = sys.float_info.max
        for other in range(k + 1, n_clusters):
            pair_scatter = (s(X, k, cluster_sizes, labels, centroids) +
                            s(X, other, cluster_sizes, labels, centroids))
            widest_scatter = max(widest_scatter, pair_scatter)
            pair_gap = utils.euclidian_dist(centroids[k], centroids[other])
            closest_gap = min(closest_gap, pair_gap)
        total += widest_scatter / closest_gap

    return total / n_clusters
def get_nearest_centroids(self):
    """Find, for every point of ``self.X``, its nearest cluster centroid.

    Centroids are recomputed from ``self.X``, ``self.labels`` and
    ``self.n_clusters``.

    Returns:
        A pair ``(centroids_numbers, centroid_distances)`` of lists with one
        entry per point: the index of the nearest centroid and the distance
        to it.  On ties the centroid with the larger index wins.

    The previous version compared each distance with
    ``centroid_distances[j]`` -- indexed by *centroid* -- instead of the
    best distance found so far for point ``i``, so the result was not the
    nearest centroid and could raise ``IndexError`` when there were more
    centroids than points.
    """
    row, _ = self.X.shape
    centroids_numbers = [0] * row
    centroid_distances = [sys.float_info.max] * row

    default_centroids = cluster_centroid.cluster_centroid(
        self.X, self.labels, self.n_clusters)

    for i in range(len(self.X)):
        for j in range(len(default_centroids)):
            distance = euclidian_dist(self.X[i], default_centroids[j])
            if distance <= centroid_distances[i]:
                centroid_distances[i] = distance
                centroids_numbers[i] = j

    return centroids_numbers, centroid_distances
def davies_bouldin(X, n_clusters, labels):
    """Return the mean over clusters of the worst pairwise scatter ratio.

    For a cluster ``i`` the ratio with another cluster ``j`` is
    ``(s(i) + s(j)) / centroid_distance(i, j)``; the largest ratio over all
    ``j != i`` is taken and these maxima are averaged.  Pairs whose
    centroids coincide are skipped.  ``s`` is defined elsewhere.

    Note the parameter order: ``n_clusters`` comes before ``labels``.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster index of every point.

    Returns:
        The averaged maximum ratio.

    Fixes relative to the previous version:
      * the running maximum was never reset, so every cluster inherited the
        largest ratio seen for any earlier cluster;
      * when two centroids coincided the ratio variable was either unbound
        (``UnboundLocalError``) or silently reused from the previous pair;
      * ``s`` was recomputed for every cluster pair instead of once per
        cluster.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    scatter = [s(X, k, point_in_c, labels, centroids) for k in range(n_clusters)]

    db = 0.0
    for i in range(n_clusters):
        worst_ratio = 0.0
        for j in range(n_clusters):
            if i == j:
                continue
            gap = utils.euclidian_dist(centroids[i], centroids[j])
            if gap != 0:
                worst_ratio = max(worst_ratio, (scatter[i] + scatter[j]) / gap)
        db += worst_ratio

    db /= float(n_clusters)
    return db
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes, per-cluster sums
    of point-to-centroid distances (``self.numerators``), the distances
    between points of different clusters (``self.inner_max_dists``), for
    every cluster and every outside point the distance to the farthest
    point of that cluster (``self.outer_min_dists``, untouched entries stay
    at ``sys.float_info.max``) and the per-cluster ratio
    ``numerator / min(outer_min_dists)`` (``self.accumulator``).

    Returns the sum of ``self.accumulator`` divided by the number of points.
    """
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    self.numerators = [0.0] * n_clusters
    for idx, label in enumerate(labels):
        self.numerators[label] += utils.euclidian_dist(X[idx], self.centroids[label])

    n_points = len(labels)
    self.inner_max_dists = [[0] * n_points for _ in range(n_points)]
    self.outer_min_dists = [[sys.float_info.max] * n_points
                            for _ in range(n_clusters)]
    self.accumulator = [0] * n_clusters

    # Distances between every point outside a cluster and every point in it.
    for cluster in range(n_clusters):
        for outside in range(n_points):
            if labels[outside] == cluster:
                continue
            for inside in range(n_points):
                if labels[inside] == cluster:
                    self.inner_max_dists[outside][inside] = utils.euclidian_dist(
                        X[outside], X[inside])
                    self.inner_max_dists[inside][outside] = \
                        self.inner_max_dists[outside][inside]

    for cluster in range(n_clusters):
        for outside in range(n_points):
            if labels[outside] == cluster:
                continue
            farthest = 0
            for inside in range(len(self.inner_max_dists[outside])):
                if labels[inside] == cluster:
                    farthest = max(farthest,
                                   self.inner_max_dists[outside][inside])
            if farthest != 0:
                self.outer_min_dists[cluster][outside] = farthest
        nearest_outside = np.amin(self.outer_min_dists[cluster])
        self.accumulator[cluster] = self.numerators[cluster] / nearest_outside

    return sum(self.accumulator) / len(labels)
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes, the centroid
    distances above the diagonal (``self.dist_centroids``, zero elsewhere)
    and the per-point value of ``utils.d_ps`` (``self.dist_ps``).

    Returns ``-(max(dist_centroids) / (sum(dist_ps) * n_clusters))``.
    """
    self.diameter = utils.find_diameter(X)
    self.dist_centroids = [[0] * n_clusters for _ in range(n_clusters)]
    self.dist_ps = [0] * len(labels)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    for k in range(n_clusters - 1):
        for other in range(k + 1, n_clusters):
            self.dist_centroids[k][other] = utils.euclidian_dist(
                self.centroids[k], self.centroids[other])
    widest_gap = np.amax(self.dist_centroids)

    for idx, label in enumerate(labels):
        self.dist_ps[idx] = utils.d_ps(X, labels, X[idx], label, self.centroids)
    point_total = sum(self.dist_ps)

    return -(widest_gap / (point_total * n_clusters))
def dunn41(X, labels, n_clusters):
    """Return the negated ratio of centroid separation to cluster diameter.

    Separation is the smallest centroid-to-centroid distance between the
    clusters of any two differently labelled points.  The diameter is the
    largest distance between two points that share a cluster.

    Args:
        X: 2-D array of points, one row per point.
        labels: cluster index of every row of ``X``.
        n_clusters: number of clusters.

    Returns:
        ``-separation / diameter``.  The diameter starts at
        ``sys.float_info.min`` as before, so a clustering without any
        same-cluster pair still yields a finite division.

    The previous version iterated the outer loop over only the first half
    of the rows, so pairs with both points in the second half were never
    examined and the diameter (and possibly the separation) could be
    missed.  All unordered pairs are visited now.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    rows, _ = X.shape

    centers = [
        [utils.euclidian_dist(centroids[a], centroids[b])
         for b in range(n_clusters)]
        for a in range(n_clusters)
    ]

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    maximum_same_c = sys.float_info.min  # max dist in the same cluster
    for i in range(rows - 1):
        for j in range(i + 1, rows):
            if labels[i] != labels[j]:
                minimum_dif_c = min(centers[labels[i]][labels[j]], minimum_dif_c)
            else:
                maximum_same_c = max(utils.euclidian_dist(X[i], X[j]),
                                     maximum_same_c)

    return -minimum_dif_c / maximum_same_c
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, the full pairwise distance
    matrix (``self.dist``) and the index pairs of points in different
    clusters (``self.dist_dif_c``) and in the same cluster
    (``self.dist_same_c``).

    Returns the negated ratio of the smallest between-cluster pairwise
    distance to the largest same-cluster pairwise distance.
    """
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    rows, colums = X.shape
    self.dist = [[0.] * rows for _ in range(rows)]
    self.dist_dif_c = []
    self.dist_same_c = []
    smallest_between = sys.float_info.max
    largest_within = sys.float_info.min

    for first in range(rows - 1):
        for second in range(first + 1, rows):
            self.dist[first][second] = utils.euclidian_dist(X[first], X[second])
            self.dist[second][first] = self.dist[first][second]
            if labels[first] != labels[second]:
                self.dist_dif_c.append([first, second])
                smallest_between = min(self.dist[first][second], smallest_between)
            else:
                self.dist_same_c.append([first, second])
                largest_within = max(self.dist[first][second], largest_within)

    return -(smallest_between / largest_within)
def s_dbw(X, labels, n_clusters):
    """Return the sum of a scatter term and a between-cluster density term.

    Scatter: the mean of ``normed_cluster_sigma`` over all clusters divided
    by ``normed_sigma(X)``.
    Density: for every ordered pair of *different* clusters,
    ``den2(k, l) / max(den1(k), den1(l))``, averaged over the
    ``n_clusters * (n_clusters - 1)`` pairs.

    All helper functions (``normed_cluster_sigma``, ``normed_sigma``,
    ``stdev``, ``den1``, ``den2``) are defined elsewhere.

    Args:
        X: data points.
        labels: cluster index of every point.
        n_clusters: number of clusters.

    Returns:
        ``scatter + density``.

    Fixes relative to the previous version:
      * the ``k == l`` terms were included in the density sum although the
        normaliser counts only pairs of different clusters;
      * with a single cluster the normaliser was zero and the division
        failed; the density term is now 0.0 in that case;
      * leftover debug ``print`` calls were removed;
      * ``den1`` is evaluated once per cluster instead of twice per pair.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)

    sigmas = 0.0
    for k in range(n_clusters):
        sigmas += normed_cluster_sigma(X, labels, k)
    sigmas /= n_clusters
    sigmas /= normed_sigma(X)

    stdev_val = stdev(X, labels, n_clusters)
    centre_density = [
        den1(X, labels, centroids, k, stdev_val) for k in range(n_clusters)
    ]

    dens = 0.0
    for k in range(n_clusters):
        for other in range(n_clusters):
            if k == other:
                continue
            dens += (den2(X, labels, centroids, k, other, stdev_val) /
                     max(centre_density[k], centre_density[other]))
    if n_clusters > 1:
        dens /= n_clusters * (n_clusters - 1)

    return sigmas + dens
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes, per-point
    centroid distances, per-cluster sums of those distances
    (``self.sums``) and, for every ordered pair of different clusters, the
    combined sum divided by the combined size (``self.delta``).

    Returns the negated ratio of the smallest off-diagonal ``self.delta``
    entry to the largest ``2 * sum / size`` over all clusters.
    """
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroid_dists = [0] * len(labels)
    self.delta = [[0] * n_clusters for _ in range(n_clusters)]
    smallest_between = sys.float_info.max  # min dist, different clusters
    self.sums = [0] * n_clusters

    for idx, label in enumerate(labels):
        self.centroid_dists[idx] = utils.euclidian_dist(X[idx], self.centroids[label])
        self.sums[label] += self.centroid_dists[idx]

    for a in range(n_clusters):
        for b in range(n_clusters):
            if a == b:
                continue
            self.delta[a][b] = (self.sums[a] + self.sums[b]) / float(
                self.cluster_sizes[a] + self.cluster_sizes[b])
            smallest_between = min(smallest_between, self.delta[a][b])

    # Twice the mean centroid distance of every cluster.
    denominator = [
        self.sums[k] * (2 / self.cluster_sizes[k]) for k in range(n_clusters)
    ]
    return -(smallest_between / max(denominator))
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes, the full
    pairwise distance matrix (``self.dists_e``), the per-point results of
    ``self.a`` and ``self.b`` (``self.a_ss``, ``self.b_ss``), several
    zero-initialised per-point buffers and, per cluster, the centroid
    distance of each of its points (``self.dists``).

    Returns the negated ratio of the summed ``self.ov(i)`` over all points
    (visited cluster by cluster) to the summed per-cluster spread (ten
    times the sum of the ``ceil(0.1 * size)`` largest centroid distances,
    divided by the size).
    """
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    n_points = len(labels)
    self.dists = [[0] * n_points for _ in range(n_clusters)]
    self.dists_e = [[0] * n_points for _ in range(n_points)]
    self.dists_for_b = [0] * n_points
    self.max_b_ss = [0] * n_points
    self.b_ss_size = [0] * n_points

    for first in range(n_points):
        for second in range(n_points):
            self.dists_e[first][second] = utils.euclidian_dist(X[first], X[second])
            self.dists_e[second][first] = self.dists_e[first][second]

    self.a_ss = [0] * n_points
    self.b_ss = [0] * n_points
    for idx, label in enumerate(labels):
        self.a_ss[idx] = self.a(X, labels, idx, label)
        self.b_ss[idx] = self.b(X, labels, idx, label)

    overlap_total = 0.0
    for cluster in range(n_clusters):
        for idx, label in enumerate(labels):
            if label == cluster:
                overlap_total += self.ov(idx)

    spread_total = 0.0
    for cluster in range(n_clusters):
        for idx, label in enumerate(labels):
            if label == cluster:
                self.dists[cluster][idx] = utils.euclidian_dist(
                    X[idx], self.centroids[cluster])
    for cluster in range(n_clusters):
        # Sum of the 0.1 * |C_k| largest centroid distances of the cluster.
        top_count = int(math.ceil(0.1 * self.cluster_sizes[cluster]))
        top_sum = 0.0
        for value in heapq.nlargest(top_count, self.dists[cluster]):
            top_sum += value
        spread_total += top_sum * 10.0 / self.cluster_sizes[cluster]

    return -(overlap_total / spread_total)
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes, the per-point
    value of ``utils.d_ps`` (``self.dist_ps``), the per-cluster value of
    ``self.sym_s`` (``self.sym_s_clusters``) and, for every ordered pair of
    different clusters, ``(sym_s_k + sym_s_l) / centroid_distance``
    (``self.fractions``; the diagonal stays 0).

    Returns the mean over clusters of the row maximum of ``self.fractions``.
    """
    self.diameter = utils.find_diameter(X)
    self.dist_ps = [0] * len(labels)
    self.sym_s_clusters = [0] * n_clusters
    self.fractions = [[0] * n_clusters for _ in range(n_clusters)]
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    total = 0
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    for idx, label in enumerate(labels):
        self.dist_ps[idx] = utils.d_ps(X, labels, X[idx], label, self.centroids)

    for k in range(n_clusters):
        self.sym_s_clusters[k] = self.sym_s(X, labels, k, self.cluster_sizes,
                                            self.centroids)

    for k in range(n_clusters):
        for other in range(n_clusters):
            if k == other:
                continue
            self.fractions[k][other] = (
                (self.sym_s_clusters[k] + self.sym_s_clusters[other]) /
                utils.euclidian_dist(self.centroids[k], self.centroids[other]))

    for k in range(n_clusters):
        total += np.amax(self.fractions[k])

    total /= float(n_clusters)
    return total
def cop(X, labels, n_clusters):
    """Return the size-normalised sum of per-cluster compactness ratios.

    For each cluster the numerator is the sum of its point-to-centroid
    distances.  The denominator is the minimum, over all points outside the
    cluster, of the distance from that point to the farthest point inside
    the cluster.  The per-cluster ratios are summed and divided by the
    number of points.

    Args:
        X: indexable collection of points.
        labels: cluster index of every point.
        n_clusters: number of clusters.

    Returns:
        ``sum(numerator_k / denominator_k) / len(labels)``.

    The previous version scanned cluster members only from index ``i``
    upwards, so members with a smaller index than the outside point were
    ignored; this is why the farthest-member distance was sometimes never
    updated.  All members are scanned now.  As before, outside points whose
    farthest-member distance is zero are ignored, and a cluster with no
    usable outside point keeps ``sys.float_info.max`` as its denominator.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)

    numerators = [0.0] * n_clusters
    for idx, label in enumerate(labels):
        numerators[label] += utils.euclidian_dist(X[idx], centroids[label])

    accumulator = 0.0
    for k in range(n_clusters):
        members = [j for j, label in enumerate(labels) if label == k]
        outer_min_dist = sys.float_info.max
        for i, label in enumerate(labels):
            if label == k:
                continue
            # Distance from outside point i to the farthest member of k.
            inner_max_dist = 0.0
            for j in members:
                inner_max_dist = max(inner_max_dist,
                                     utils.euclidian_dist(X[i], X[j]))
            if inner_max_dist > 0:
                outer_min_dist = min(outer_min_dist, inner_max_dist)
        accumulator += numerators[k] / outer_min_dist

    return accumulator / len(labels)
def find(self, X, labels, n_clusters):
    """Compute the index and cache its intermediate state on ``self``.

    Cached state: data diameter, centroids, cluster sizes, the pairwise
    distances between points sharing a cluster (``self.dists``, zero for
    all other pairs) and the centroid distance matrix
    (``self.centroids_dist``, diagonal left at ``sys.float_info.max``).

    Returns the ratio of the summed ``farthest same-cluster distance /
    cluster size`` over all points to the summed nearest-centroid distance
    over all clusters.
    """
    self.diameter = utils.find_diameter(X)
    elements, ignore_columns = X.shape
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.dists = [[0] * elements for _ in range(elements)]

    for first in range(elements - 1):
        for second in range(first + 1, elements):
            # Only pairs inside the same cluster are recorded.
            if labels[first] == labels[second]:
                self.dists[first][second] = utils.euclidian_dist(X[first], X[second])
                self.dists[second][first] = self.dists[first][second]

    extent_total = 0.0
    for idx in range(elements):
        farthest = np.amax(self.dists[idx])
        extent_total += farthest / self.cluster_sizes[labels[idx]]

    separation_total = 0.0
    self.centroids_dist = [[sys.float_info.max] * n_clusters
                           for _ in range(n_clusters)]
    for a in range(n_clusters):
        for b in range(n_clusters):
            if a == b:
                continue
            self.centroids_dist[a][b] = utils.euclidian_dist(self.centroids[a],
                                                             self.centroids[b])
    for a in range(n_clusters):
        separation_total += np.amin(self.centroids_dist[a])

    return extent_total / separation_total