def update(self, X, n_clusters, labels, k, l, id):
    """Refresh the cached sums for point ``id`` and return the normalised score.

    The distance from ``X[id]`` to every point labelled ``k`` is subtracted
    from ``self.s_c`` and the distance to every point labelled ``l`` is
    added, which presumably corresponds to the point having moved from
    cluster ``k`` to cluster ``l`` -- confirm against the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``(s_c - sum(s_min)) / (sum(s_max) - sum(s_min))``.
    """
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    moved_point = X[id]
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), moved_point, k, l)

    for idx, label in enumerate(labels):
        if label == k:
            self.s_c -= utils.euclidian_dist(X[idx], moved_point)
        if label == l:
            self.s_c += utils.euclidian_dist(X[idx], moved_point)

    pairs_before = self.n_w
    size_k = self.cluster_sizes[k]
    size_l = self.cluster_sizes[l]
    # Algebraically this is n_w - size_k + (size_l - 1); the expanded form
    # is kept so the arithmetic (and resulting numeric type) is unchanged.
    self.n_w = (self.n_w
                - (size_k + 1) * size_k / 2
                + size_k * (size_k - 1) / 2
                - (size_l - 1) * (size_l - 2) / 2
                + size_l * (size_l - 1) / 2)

    # The extreme-distance lists are only rebuilt when n_w changed by more
    # than 10% of the number of points.
    tolerance = 0.1
    if abs(self.n_w - pairs_before) > tolerance * len(labels):
        pair_count = int(self.n_w)
        self.s_min = heapq.nsmallest(pair_count, self.distances)
        self.s_max = heapq.nlargest(pair_count, self.distances)

    lower = sum(self.s_min)
    upper = sum(self.s_max)
    return (self.s_c - lower) / (upper - lower)
def update(self, X, n_clusters, labels, k, l, id):
    """Update the cached per-cluster state for a one-point change and return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Cached values belonging to cluster ``k`` or ``l`` are recomputed only
    when that cluster's centroid moved by more than
    ``delta * self.diameter``.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        The mean over clusters of the row-wise maximum of ``self.fractions``.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    # Mathematically 1 / (10 * len(X)); kept in its original form.
    delta = 10**(-math.log(len(X), 10) - 1)
    threshold = delta * self.diameter
    # The centroid displacement does not depend on the point being visited,
    # so it is evaluated once instead of once per point.
    k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
    l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold

    if k_moved or l_moved:
        for i, label in enumerate(labels):
            if (label == k and k_moved) or (label == l and l_moved):
                self.dist_ps[i] = utils.d_ps(X, labels, X[i], label, self.centroids)

    if k_moved:
        self.sym_s_clusters[k] = self.sym_s(X, labels, k, self.cluster_sizes, self.centroids)
    if l_moved:
        self.sym_s_clusters[l] = self.sym_s(X, labels, l, self.cluster_sizes, self.centroids)

    # Refresh every pairwise ratio that involves cluster k or cluster l.
    for i in range(n_clusters):
        for changed in (k, l):
            if i != changed:
                gap = utils.euclidian_dist(self.centroids[i], self.centroids[changed])
                ratio = (self.sym_s_clusters[i] + self.sym_s_clusters[changed]) / gap
                self.fractions[i][changed] = ratio
                self.fractions[changed][i] = ratio

    db = sum(np.amax(self.fractions[i]) for i in range(n_clusters))
    return db / float(n_clusters)
def update(self, X, n_clusters, labels, k, l, id):
    """Update centroid distances and point-to-centroid distances; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``-(numerator / denominator)`` where the numerator sums the row-wise
        minimum of ``self.centroid_dists`` and the denominator sums, per
        cluster, ten times the sum of the ``ceil(0.1 * size)`` largest
        entries of ``self.dists[c]`` divided by the cluster size.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    # Centroids k and l both changed, so every distance that involves either
    # of them must be refreshed -- including pairs with a lower-indexed
    # cluster, which an ``i > k`` test would leave stale.
    for i in range(n_clusters):
        for changed in (k, l):
            if i != changed:
                gap = utils.euclidian_dist(self.centroids[i], self.centroids[changed])
                self.centroid_dists[changed][i] = gap
                self.centroid_dists[i][changed] = gap

    numerator = 0.0
    for i in range(n_clusters):
        numerator += np.amin(self.centroid_dists[i])

    self.dists[k][id] = 0.
    # NOTE(review): self.dists[l][id] is only filled in by the loop below,
    # i.e. only when centroid l moved past the threshold -- confirm that is
    # intended for the point that was just reassigned.

    # Mathematically 1 / (10 * len(X)); kept in its original form.
    delta = 10**(-math.log(len(X), 10) - 1)
    threshold = delta * self.diameter
    # Loop-invariant displacement checks, evaluated once.
    k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
    l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold
    if k_moved or l_moved:
        for i, label in enumerate(labels):
            if (label == k and k_moved) or (label == l and l_moved):
                self.dists[label][i] = utils.euclidian_dist(X[i], self.centroids[label])

    denominator = 0.0
    for c in range(n_clusters):
        # Sum of the 0.1 * |C_c| largest point-to-centroid distances.
        top_count = int(math.ceil(0.1 * self.cluster_sizes[c]))
        largest = heapq.nlargest(top_count, self.dists[c])
        denominator += sum(largest) * 10.0 / self.cluster_sizes[c]
    return -(numerator / denominator)
def update(self, X, n_clusters, labels, k, l, id):
    """Update the cached sigma and density terms and return their sum.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    ``self.sigmas[k]`` / ``self.sigmas[l]`` and ``self.dens`` are only
    recomputed when the corresponding centroid moved by more than
    ``delta * self.diameter``; otherwise the cached values are reused.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``sum(self.sigmas) / (n_clusters * self.normed_sigma_x) + self.dens``.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    # Mathematically 1 / (10 * len(X)); kept in its original form.
    delta = 10**(-math.log(len(X), 10) - 1)
    threshold = delta * self.diameter
    # Each displacement is needed for several decisions; compute it once.
    k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
    l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold

    if k_moved:
        self.sigmas[k] = self.normed_cluster_sigma(X, labels, k)
    if l_moved:
        self.sigmas[l] = self.normed_cluster_sigma(X, labels, l)
    term1 = sum(self.sigmas) / (n_clusters * self.normed_sigma_x)
    stdev_val = self.stdev(n_clusters)

    if k_moved or l_moved:
        # den1 depends only on the cluster index, so evaluate it once per
        # cluster rather than twice per ordered pair.
        # NOTE(review): assumes den1 has no side effects -- confirm.
        single = [self.den1(X, labels, self.centroids, c, stdev_val)
                  for c in range(n_clusters)]
        self.dens = 0.0
        # Distinct loop names so the k / l parameters are not shadowed.
        for a in range(n_clusters):
            for b in range(n_clusters):
                self.dens += (self.den2(X, labels, self.centroids, a, b, stdev_val)
                              / max(single[a], single[b]))
        self.dens /= n_clusters * (n_clusters - 1)
    return term1 + self.dens
def update(self, X, n_clusters, labels, k, l, id):
    """Update centroid-pair distances and per-point terms; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``-(max(self.dist_centroids) / (sum(self.dist_ps) * n_clusters))``.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    # Pair distances are kept at [smaller index][larger index].  Centroids
    # k and l both changed, so pairs with a lower-indexed cluster must be
    # refreshed too; an ``i > k`` test alone would leave them stale.
    for i in range(n_clusters):
        for changed in (k, l):
            if i != changed:
                row, col = (changed, i) if i > changed else (i, changed)
                self.dist_centroids[row][col] = utils.euclidian_dist(
                    self.centroids[i], self.centroids[changed])
    numerator = np.amax(self.dist_centroids)

    # Mathematically 1 / (10 * len(X)); kept in its original form.
    delta = 10**(-math.log(len(X), 10) - 1)
    threshold = delta * self.diameter
    # Loop-invariant displacement checks, evaluated once.
    k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
    l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold
    if k_moved or l_moved:
        for i, label in enumerate(labels):
            if (label == k and k_moved) or (label == l and l_moved):
                self.dist_ps[i] = utils.d_ps(X, labels, X[i], label, self.centroids)

    denominator = sum(self.dist_ps)
    return -(numerator / (denominator * n_clusters))
def update(self, X, n_clusters, labels, k, l, id):
    """Rebuild the cached terms for clusters ``k`` and ``l`` and return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``sum(self.accumulator) / len(labels)``.
    """
    point = X[id]
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    # Re-accumulate the point-to-centroid distance sums of both clusters.
    self.numerators[k] = 0.0
    self.numerators[l] = 0.0
    for idx, label in enumerate(labels):
        if label == k or label == l:
            self.numerators[label] += utils.euclidian_dist(
                X[idx], self.centroids[label])

    # Entries against point ``id``: filled with the real distance for
    # points labelled k, zeroed for points labelled l.
    for idx, label in enumerate(labels):
        if label == k:
            self.inner_max_dists[idx][id] = utils.euclidian_dist(X[idx], X[id])
            self.inner_max_dists[id][idx] = self.inner_max_dists[idx][id]
        if label == l:
            self.inner_max_dists[idx][id] = 0
            self.inner_max_dists[id][idx] = 0

    self.outer_min_dists[l][id] = sys.float_info.max

    n_points = len(labels)
    for cluster in (k, l):
        for idx in range(n_points):
            if labels[idx] == cluster:
                continue
            row = self.inner_max_dists[idx]
            farthest = 0
            for other in range(len(row)):
                if labels[other] == cluster:
                    farthest = max(farthest, row[other])
            # A zero maximum leaves the previously stored value untouched.
            if farthest != 0:
                self.outer_min_dists[cluster][idx] = farthest
        closest_outside = np.amin(self.outer_min_dists[cluster])
        self.accumulator[cluster] = self.numerators[cluster] / closest_outside

    return sum(self.accumulator) / len(labels)
def update(self, X, n_clusters, labels, k, l, id):
    """Recompute the pairwise ``delta`` terms from distance sums; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``-(min off-diagonal delta / max per-cluster denominator)``; empty
        clusters contribute ``inf`` to both.
    """
    point = X[id]
    prev_cluster_sizes = list(self.cluster_sizes)
    prev_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(np.copy(labels), n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        self.centroids, self.cluster_sizes, point, k, l)

    # update numerator
    # Mathematically 1 / (10 * len(X)); kept in its original form.
    dell = 10**(-math.log(len(X), 10) - 1)
    threshold = dell * self.diameter
    # Loop-invariant displacement checks, evaluated once instead of per point.
    k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
    l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold

    # Work on a copy; self.centroid_dists itself is not written here.
    new_centroid_dists = list(self.centroid_dists)
    if k_moved or l_moved:
        for i, label in enumerate(labels):
            if (label == k and k_moved) or (label == l and l_moved):
                new_centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[label])

    # NOTE(review): every off-diagonal entry is overwritten further down, so
    # this scaling only has a lasting effect on the diagonal -- kept as is.
    for i in range(n_clusters):
        for j in range(n_clusters):
            self.delta[i][j] *= (prev_cluster_sizes[i] + prev_cluster_sizes[j])

    # Sums for untouched clusters are reused; k and l are re-accumulated.
    new_sums = [0] * n_clusters
    for i in range(n_clusters):
        if i != k and i != l:
            new_sums[i] = self.sums[i]
    for i, label in enumerate(labels):
        if label == k or label == l:
            new_sums[label] += new_centroid_dists[i]

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    for i in range(n_clusters):
        for j in range(n_clusters):
            if i == j:
                continue
            pair_size = self.cluster_sizes[i] + self.cluster_sizes[j]
            if pair_size == 0:
                self.delta[i][j] = float('inf')
            else:
                self.delta[i][j] = (new_sums[i] + new_sums[j]) / float(pair_size)
            minimum_dif_c = min(minimum_dif_c, self.delta[i][j])

    # update denominator
    denominator = list(new_sums)
    for i in range(n_clusters):
        if self.cluster_sizes[i] == 0:
            denominator[i] = float('inf')
        else:
            # 2.0 keeps this a true division even where "/" on two integers
            # would truncate.
            denominator[i] *= 2.0 / self.cluster_sizes[i]
    return -(minimum_dif_c / max(denominator))
def update(self, X, n_clusters, labels, k, l, id):
    """Adjust the pairwise ``delta`` sums and the per-cluster sums; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``-(minimum non-zero upper-triangular delta / max(denominator))``.
    """
    self.diameter = utils.find_diameter(X)
    prev_point_in_c = list(self.point_in_c)
    prev_centroids = np.copy(self.centroids)
    self.point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.point_in_c), X[id], k, l)

    # update numerator
    # Scale the stored values by the previous size products, adjust them by
    # the distances involving point ``id``, then divide by the new products.
    # NOTE(review): the scaling covers the full matrix but the division
    # below only covers i < j -- confirm that is intended.
    for i in range(n_clusters):
        for j in range(n_clusters):
            self.delta[i][j] *= (prev_point_in_c[i] * prev_point_in_c[j])
    for i, label in enumerate(labels):
        if label != k and id < i:
            self.delta[k][label] -= self.dists[id][i]
        if label != k and id > i:
            self.delta[label][k] -= self.dists[i][id]
        if label != l and id < i:
            self.delta[l][label] += self.dists[id][i]
        if label != l and id > i:
            self.delta[label][l] += self.dists[i][id]

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    for i in range(n_clusters - 1):
        for j in range(i + 1, n_clusters):
            self.delta[i][j] /= float(self.point_in_c[i] * self.point_in_c[j])
            if self.delta[i][j] != 0:
                minimum_dif_c = min(minimum_dif_c, self.delta[i][j])

    # update denominator
    # Mathematically 1 / (10 * len(X)); kept in its original form.
    dell = 10 ** (-math.log(len(X), 10) - 1)
    threshold = dell * self.diameter
    # Loop-invariant displacement checks, evaluated once instead of per point.
    k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
    l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold

    # Work on a copy; self.centroid_dists itself is not written here.
    new_centroid_dists = list(self.centroid_dists)
    if k_moved or l_moved:
        for i, label in enumerate(labels):
            if (label == k and k_moved) or (label == l and l_moved):
                new_centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[label])

    # Sums for untouched clusters are reused; k and l are re-accumulated.
    new_sums = [0] * n_clusters
    for i in range(n_clusters):
        if i != k and i != l:
            new_sums[i] = self.sums[i]
    for i, label in enumerate(labels):
        if label == k or label == l:
            new_sums[label] += new_centroid_dists[i]

    denominator = list(new_sums)
    for i in range(n_clusters):
        if self.point_in_c[i] != 0:
            # 2.0 keeps this a true division even where "/" on two integers
            # would truncate.
            denominator[i] *= 2.0 / self.point_in_c[i]
    return -(minimum_dif_c / max(denominator))
def update(self, X, n_clusters, labels, k, l, id):
    """Update centroid-pair distances and per-cluster sums; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``-(min(self.centers) / max(denominator))``.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        np.copy(labels), n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    # update denominator
    # Mathematically 1 / (10 * len(X)); kept in its original form.
    dell = 10**(-math.log(len(X), 10) - 1)
    threshold = dell * self.diameter
    # Loop-invariant displacement checks, evaluated once instead of per point.
    k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
    l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold

    # Work on a copy; self.centroid_dists itself is not written here.
    new_centroid_dists = list(self.centroid_dists)
    if k_moved or l_moved:
        for i, label in enumerate(labels):
            if (label == k and k_moved) or (label == l and l_moved):
                new_centroid_dists[i] = utils.euclidian_dist(
                    X[i], self.centroids[label])

    # Sums for untouched clusters are reused; k and l are re-accumulated.
    new_sums = [0] * n_clusters
    for i in range(n_clusters):
        if i != k and i != l:
            new_sums[i] = self.sums[i]
    for i, label in enumerate(labels):
        if label == k or label == l:
            new_sums[label] += new_centroid_dists[i]

    denominator = list(new_sums)
    for i in range(n_clusters):
        if self.cluster_sizes[i] != 0:
            # 2.0 keeps this a true division even where "/" on two integers
            # would truncate.
            denominator[i] *= 2.0 / self.cluster_sizes[i]

    # update numerator
    for i in range(n_clusters):
        for changed in (k, l):
            if i != changed:
                gap = utils.euclidian_dist(
                    self.centroids[i], self.centroids[changed])
                self.centers[i][changed] = gap
                self.centers[changed][i] = gap
    minimum_dif_c = np.amin(self.centers)
    return -(minimum_dif_c / max(denominator))
def update(self, X, n_clusters, labels, k, l, id):
    """Update the same-cluster pair list and pairwise ``delta`` sums; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``-minimum_dif_c / maximum_same_c``.
    """
    prev_cluster_sizes = list(self.cluster_sizes)
    # NOTE(review): the centroids are updated with the sizes from *before*
    # the recount -- confirm this is what update_centroids expects.
    self.centroids = cluster_centroid.update_centroids(
        list(self.centroids), list(self.cluster_sizes), X[id], k, l)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        labels, n_clusters)

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    # sys.float_info.min is the smallest *positive* float, used as the
    # starting value for a maximum over distances.
    maximum_same_c = sys.float_info.min  # max dist in the same cluster

    # Pairs (point of cluster k, id) are excluded for this call.  A set
    # gives O(1) membership instead of rescanning a list for every pair.
    # NOTE(review): the excluded pairs are not removed from
    # self.dist_same_c itself -- confirm that is intended.
    excluded = set()
    for i, label in enumerate(labels):
        if label == k:
            excluded.add((i, id))
            excluded.add((id, i))
        if label == l and i != id:
            self.dist_same_c.append([i, id])
            self.dist_same_c.append([id, i])
    for pair in self.dist_same_c:
        cur = self.dists[pair[0]][pair[1]]
        if cur > maximum_same_c and (pair[0], pair[1]) not in excluded:
            maximum_same_c = cur

    # Scale the stored values by the previous size products, adjust them by
    # the distances involving point ``id``, then divide by the new products.
    for i in range(n_clusters - 1):
        for j in range(i + 1, n_clusters):
            self.delta[i][j] *= (prev_cluster_sizes[i] * prev_cluster_sizes[j])
    for i, label in enumerate(labels):
        if label != k and id < i:
            self.delta[k][label] -= self.dists[id][i]
        if label != k and id > i:
            self.delta[label][k] -= self.dists[i][id]
        if label != l and id < i:
            self.delta[l][label] += self.dists[id][i]
        if label != l and id > i:
            self.delta[label][l] += self.dists[i][id]
    for i in range(n_clusters - 1):
        for j in range(i + 1, n_clusters):
            self.delta[i][j] /= float(self.cluster_sizes[i] * self.cluster_sizes[j])
            if self.delta[i][j] != 0:
                minimum_dif_c = min(minimum_dif_c, self.delta[i][j])
    return -minimum_dif_c / maximum_same_c
def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally adjust the cached ``a`` / ``b`` values and return the index.

    The arithmetic treats ``cluster_sizes[k] + 1`` and
    ``cluster_sizes[l] - 1`` as the previous sizes, i.e. cluster ``k`` lost
    point ``id`` and cluster ``l`` gained it, with ``labels`` already
    updated.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the cluster that lost / gained the point.
        id: index of the changed point in ``X``.

    Returns:
        The negated mean of ``(b - a) / max(b, a)`` over all points.
    """
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), X[id], k, l)

    # The changed point itself is recomputed from scratch.
    self.a_ss[id] = self.a(X, labels, id, l)
    self.b_ss[id] = self.b(X, n_clusters, labels, id, l)

    size_k = self.cluster_sizes[k]
    size_l = self.cluster_sizes[l]
    infinity = float('inf')
    n_points = len(labels)

    for idx in range(n_points):
        if idx == id:
            continue
        to_moved = self.dists_e[idx][id]
        label = labels[idx]

        # Within-cluster mean: undo the old average, drop/add the changed
        # point, re-average with the new size (inf for an empty cluster).
        if label == k:
            self.a_ss[idx] *= (size_k + 1)
            self.a_ss[idx] -= to_moved
            if size_k == 0:
                self.a_ss[idx] = infinity
            else:
                self.a_ss[idx] /= size_k
        if label == l:
            self.a_ss[idx] *= (size_l - 1)
            self.a_ss[idx] += to_moved
            if size_l == 0:
                self.a_ss[idx] = infinity
            else:
                self.a_ss[idx] /= size_l

        # Mean distance from this point to clusters l and k, same scheme.
        per_cluster = self.dists_for_b[idx]
        per_cluster[l] *= (size_l - 1)
        per_cluster[l] += to_moved
        if size_l == 0:
            per_cluster[l] = infinity
        else:
            per_cluster[l] /= size_l
        per_cluster[k] *= (size_k + 1)
        per_cluster[k] -= to_moved
        if size_k == 0:
            per_cluster[k] = infinity
        else:
            per_cluster[k] /= size_k
        self.b_ss[idx] = min(per_cluster)

    total = sum(
        (self.b_ss[idx] - self.a_ss[idx]) / max(self.b_ss[idx], self.a_ss[idx])
        for idx in range(n_points))
    return -(total / float(n_points))
def update(self, X, n_clusters, labels, k, l, id):
    """Update the cached scatter sums and centroid distances; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        The mean over rows of
        ``max(self.max_s_sum[i]) / min(self.min_centroids_dist[i])``.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    # Mathematically 1 / (10 * len(X)); kept in its original form.
    delta = 10**(-math.log(len(X), 10) - 1)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        self.centroids, self.cluster_sizes, point, k, l)

    # The scatter of a cluster is only recomputed when its centroid moved
    # by more than delta * diameter.
    threshold = delta * self.diameter
    if utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold:
        self.s_clusters[k] = self.s(X, k, self.cluster_sizes, labels, self.centroids)
    if utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold:
        self.s_clusters[l] = self.s(X, l, self.cluster_sizes, labels, self.centroids)

    # Pair values are kept at [smaller index][larger index].  Clusters k and
    # l both changed, so pairs with a lower-indexed cluster must be
    # refreshed too; an ``i > k`` test alone would leave them stale.
    for i in range(n_clusters):
        for changed in (k, l):
            if i != changed:
                row, col = (changed, i) if i > changed else (i, changed)
                self.max_s_sum[row][col] = self.s_clusters[i] + self.s_clusters[changed]
                self.min_centroids_dist[row][col] = utils.euclidian_dist(
                    self.centroids[i], self.centroids[changed])

    numerator = 0.0
    for i in range(n_clusters):
        numerator += np.max(self.max_s_sum[i]) / np.min(self.min_centroids_dist[i])
    return numerator / n_clusters
def update(self, X, n_clusters, labels, k, l, id):
    """Update the same-cluster pair list and centroid distances; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``-(min(self.centers) / maximum_same_c)``.
    """
    point = X[id]
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    # update denominator
    # sys.float_info.min is the smallest *positive* float, used as the
    # starting value for a maximum over distances.
    maximum_same_c = sys.float_info.min  # max dist in the same cluster

    # Pairs (point of cluster k, id) are excluded for this call.  A set
    # gives O(1) membership instead of rescanning a list for every pair.
    # NOTE(review): the excluded pairs are not removed from
    # self.dist_same_c itself -- confirm that is intended.
    excluded = set()
    for i, label in enumerate(labels):
        if label == k:
            excluded.add((i, id))
            excluded.add((id, i))
        if label == l and i != id:
            self.dist_same_c.append([i, id])
            self.dist_same_c.append([id, i])
    for pair in self.dist_same_c:
        cur = self.dists[pair[0]][pair[1]]
        if cur > maximum_same_c and (pair[0], pair[1]) not in excluded:
            maximum_same_c = cur

    # update numerator
    for i in range(n_clusters):
        for changed in (k, l):
            if i != changed:
                gap = utils.euclidian_dist(
                    self.centroids[i], self.centroids[changed])
                self.centers[i][changed] = gap
                self.centers[changed][i] = gap
    minimum_dif_c = np.amin(self.centers)
    return -(minimum_dif_c / maximum_same_c)
def update(self, X, n_clusters, labels, k, l, id):
    """Update the cached scatter values and pair ratios; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        The mean over clusters of the row-wise maximum of ``self.sums``.
    """
    point = X[id]
    # Mathematically 1 / (10 * len(X)); kept in its original form.
    delta = 10**(-math.log(len(X), 10) - 1)
    prev_centroids = np.copy(self.centroids)
    # NOTE(review): self.points_in_clusters is not recounted in this method
    # -- confirm it is kept current elsewhere.
    self.centroids = cluster_centroid.update_centroids(
        self.centroids, self.points_in_clusters, point, k, l)

    # Scatter is only recomputed for a cluster whose centroid moved by more
    # than delta * diameter.
    for cluster in (k, l):
        shift = utils.euclidian_dist(prev_centroids[cluster], self.centroids[cluster])
        if shift > delta * self.diameter:
            self.s_clusters[cluster] = self.s(
                X, cluster, self.points_in_clusters, labels, self.centroids)

    # Refresh every ratio involving k or l; coinciding centroids (zero
    # distance) keep their previous entry.
    for other in range(n_clusters):
        for cluster in (k, l):
            if other == cluster:
                continue
            gap = utils.euclidian_dist(self.centroids[other], self.centroids[cluster])
            if gap != 0:
                self.sums[other][cluster] = (
                    self.s_clusters[other] + self.s_clusters[cluster]) / gap
                self.sums[cluster][other] = (
                    self.s_clusters[other] + self.s_clusters[cluster]) / gap

    total = 0
    for row in range(n_clusters):
        total += np.amax(self.sums[row])
    return total / float(n_clusters)
def update(self, X, n_clusters, labels, k, l, id):
    """Update point-pair and centroid-pair distances; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: 2-D array of points (``X.shape`` is unpacked into two values).
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``numerator / denominator`` where the numerator sums, per point, the
        row maximum of ``self.dists`` divided by the size of the point's
        cluster, and the denominator sums the row minima of
        ``self.centroids_dist``.
    """
    point = X[id]
    n_points, _n_features = X.shape
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    # Entries against point ``id``: zeroed for points labelled k, filled
    # with the real distance for points labelled l.
    for idx in range(n_points):
        label = labels[idx]
        if label == k:
            self.dists[idx][id] = 0
            self.dists[id][idx] = 0
        if label == l:
            self.dists[id][idx] = utils.euclidian_dist(X[idx], X[id])
            self.dists[idx][id] = self.dists[id][idx]

    numerator = 0.0
    for idx in range(n_points):
        numerator += np.amax(self.dists[idx]) / self.cluster_sizes[labels[idx]]

    # Refresh every centroid distance that involves k or l.
    for other in range(n_clusters):
        for cluster in (k, l):
            if other != cluster:
                self.centroids_dist[cluster][other] = utils.euclidian_dist(
                    self.centroids[other], self.centroids[cluster])
                self.centroids_dist[other][cluster] = self.centroids_dist[cluster][other]

    denominator = 0.0
    for row in range(n_clusters):
        denominator += np.amin(self.centroids_dist[row])
    return numerator / denominator
def update(self, X, n_clusters, labels, k, l, id):
    """Update the same-cluster pair list and sum-based ``delta``; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``; presumably the point moved from cluster ``k`` to
    cluster ``l`` and ``labels`` already reflects that -- confirm against
    the caller.

    Args:
        X: indexable collection of points.
        n_clusters: number of clusters.
        labels: cluster label of every point.
        k, l: the two cluster indices touched by the change.
        id: index of the changed point in ``X``.

    Returns:
        ``-(minimum off-diagonal delta / maximum_same_c)``.
    """
    point = X[id]
    prev_cluster_sizes = list(self.cluster_sizes)
    prev_centroids = np.copy(self.centroids)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        labels, n_clusters)
    self.centroids = cluster_centroid.update_centroids(
        np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

    # update denominator
    # sys.float_info.min is the smallest *positive* float, used as the
    # starting value for a maximum over distances.
    maximum_same_c = sys.float_info.min  # max dist in the same cluster

    # Pairs (point of cluster k, id) are excluded for this call.  A set
    # gives O(1) membership instead of rescanning a list for every pair.
    # NOTE(review): the excluded pairs are not removed from
    # self.dists_same_c itself -- confirm that is intended.
    excluded = set()
    for i, label in enumerate(labels):
        if label == k:
            excluded.add((i, id))
            excluded.add((id, i))
        if label == l and i != id:
            self.dists_same_c.append([i, id])
            self.dists_same_c.append([id, i])
    for pair in self.dists_same_c:
        cur = self.dists[pair[0]][pair[1]]
        if cur > maximum_same_c and (pair[0], pair[1]) not in excluded:
            maximum_same_c = cur

    # update numerator
    # Mathematically 1 / (10 * len(X)); kept in its original form.
    dell = 10**(-math.log(len(X), 10) - 1)
    threshold = dell * self.diameter
    # Loop-invariant displacement checks, evaluated once instead of per point.
    k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
    l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold

    # Work on a copy; self.centroid_dists itself is not written here.
    new_centroid_dists = list(self.centroid_dists)
    if k_moved or l_moved:
        for i, label in enumerate(labels):
            if (label == k and k_moved) or (label == l and l_moved):
                new_centroid_dists[i] = utils.euclidian_dist(
                    X[i], self.centroids[label])

    # NOTE(review): every off-diagonal entry is overwritten further down, so
    # this scaling only has a lasting effect on the diagonal -- kept as is.
    for i in range(n_clusters):
        for j in range(n_clusters):
            self.delta[i][j] *= (prev_cluster_sizes[i] + prev_cluster_sizes[j])

    # Sums for untouched clusters are reused; k and l are re-accumulated.
    new_sums = [0] * n_clusters
    for i in range(n_clusters):
        if i != k and i != l:
            new_sums[i] = self.sums[i]
    for i, label in enumerate(labels):
        if label == k or label == l:
            new_sums[label] += new_centroid_dists[i]

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    for i in range(n_clusters):
        for j in range(n_clusters):
            if i != j:
                self.delta[i][j] = (new_sums[i] + new_sums[j]) / float(
                    self.cluster_sizes[i] + self.cluster_sizes[j])
                minimum_dif_c = min(minimum_dif_c, self.delta[i][j])
    return -(minimum_dif_c / maximum_same_c)
def update(self, X, n_clusters, labels, k, l, id):
    """Incrementally adjust the cached a/b values and distances; return the index.

    ``X[id]`` is handed to ``cluster_centroid.update_centroids`` together
    with ``k`` and ``l``.  The arithmetic below subtracts the distance to
    ``id`` for points labelled ``k`` and adds it for points labelled ``l``,
    so presumably the point moved from cluster ``k`` to cluster ``l`` and
    ``labels`` already reflects that -- confirm against the caller.

    Returns:
        ``-(numerator / denominator)`` where the numerator sums
        ``self.ov(i)`` over all points and the denominator sums, per
        cluster, ten times the sum of the ``ceil(0.1 * size)`` largest
        entries of ``self.dists[c]`` divided by the cluster size.
    """
    point = X[id]
    prev_centroids = np.copy(self.centroids)
    prev_cluster_sizes = list(self.cluster_sizes)
    # NOTE(review): the centroids are updated with the sizes from *before*
    # the recount on the next line -- confirm this is what update_centroids
    # expects.
    self.centroids = cluster_centroid.update_centroids(self.centroids, self.cluster_sizes, point, k, l)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    # Shallow copy; not read again by the active code below.
    prev_dists_for_b = list(self.dists_for_b)
    # The changed point itself is recomputed from scratch.
    self.a_ss[id] = self.a(X, labels, id, l)
    self.b_ss[id] = self.b(X, labels, id, l)
    for i in range(len(labels)):
        if i == id:
            continue
        if labels[i] == k:
            # a: undo the old average, drop the distance to id, re-average.
            self.a_ss[i] *= prev_cluster_sizes[k]
            self.a_ss[i] -= self.dists_e[i][id]
            self.a_ss[i] /= self.cluster_sizes[k]
            # b: same un-average / re-average scheme around the adjustment.
            self.b_ss[i] *= prev_cluster_sizes[k]
            j = prev_cluster_sizes[k]  # only referenced by disabled code
            # id is now recorded as a candidate distance for point i.
            self.dists_for_b[i][id] = self.dists_e[i][id]
            if self.max_b_ss[i] > self.dists_e[i][id]:
                # The new distance replaces the current maximum in the sum;
                # the maximum is then re-derived from the finite entries.
                self.b_ss[i] -= self.max_b_ss[i]
                self.b_ss[i] += self.dists_e[i][id]
                filtered = [x for x in self.dists_for_b[i] if x != float('inf')]
                self.max_b_ss[i] = max(filtered)
            elif self.b_ss_size[i] < self.cluster_sizes[k]:
                # Fewer terms than the cluster size: add one more.
                self.b_ss[i] += self.dists_e[i][id]
                self.b_ss_size[i] += 1
            self.b_ss[i] /= self.cluster_sizes[k]
        if labels[i] == l:
            # a: undo the old average, add the distance to id, re-average.
            self.a_ss[i] *= prev_cluster_sizes[l]
            self.a_ss[i] += self.dists_e[i][id]
            self.a_ss[i] /= self.cluster_sizes[l]
            self.b_ss[i] *= prev_cluster_sizes[l]
            j = prev_cluster_sizes[l]  # only referenced by disabled code
            # id is removed from point i's candidate distances.
            self.b_ss[i] -= self.dists_e[i][id]
            self.b_ss_size[i] -= 1
            self.dists_for_b[i][id] = float('inf')
            if self.b_ss_size[i] < self.cluster_sizes[l]:
                # Top the sum back up with the largest remaining finite entry.
                filtered = [x for x in self.dists_for_b[i] if x != float('inf')]
                new_max = max(filtered)
                self.b_ss[i] += new_max
                self.b_ss_size[i] += 1
                self.max_b_ss[i] = new_max
            self.b_ss[i] /= self.cluster_sizes[l]
    numerator = 0.0
    # Visits every point once, grouped by cluster.
    for c in range(n_clusters):
        for i in range(len(labels)):
            if labels[i] != c:
                continue
            numerator += self.ov(i)
    denominator = 0.0
    self.dists[k][id] = 0.
    # Fixed threshold factor; a len(X)-dependent one
    # (10**(-math.log(len(X), 10) - 1)) was used here previously.
    delta = 10**(-9)
    # Point-to-centroid distances are refreshed only for clusters whose
    # centroid moved by more than delta * diameter.
    for i in range(len(labels)):
        if (labels[i] == k and utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > delta * self.diameter
                or labels[i] == l and utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > delta * self.diameter):
            self.dists[labels[i]][i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])
    for c in range(n_clusters):
        # get sum of 0.1*|Ck| largest elements
        max_n = heapq.nlargest(int(math.ceil(0.1 * self.cluster_sizes[c])), self.dists[c])
        denominator += sum(max_n) * 10.0 / self.cluster_sizes[c]
    return -(numerator / denominator)