Esempio n. 1
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Refresh the cached sums after point ``id`` changes cluster and return the score.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: cluster whose members' distances to the moved point are
                subtracted from ``self.s_c``.
            l: cluster whose members' distances to the moved point are
                added to ``self.s_c``.
            id: index of the moved point in ``X``.

        Returns:
            ``(s_c - sum(s_min)) / (sum(s_max) - sum(s_min))``.
        """
        moved = X[id]
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), moved, k, l)

        # Move the point's contribution from cluster k to cluster l.
        for idx, label in enumerate(labels):
            if label == k:
                self.s_c -= utils.euclidian_dist(X[idx], moved)
            if label == l:
                self.s_c += utils.euclidian_dist(X[idx], moved)

        size_k = self.cluster_sizes[k]
        size_l = self.cluster_sizes[l]
        old_n_w = self.n_w
        self.n_w = (old_n_w
                    - (size_k + 1) * size_k / 2 + size_k * (size_k - 1) / 2
                    - (size_l - 1) * (size_l - 2) / 2 + size_l * (size_l - 1) / 2)

        # The extreme-distance lists are only rebuilt when the pair count
        # changed by more than 10% of the number of points.
        if abs(self.n_w - old_n_w) > 0.1 * len(labels):
            pair_count = int(self.n_w)
            self.s_min = heapq.nsmallest(pair_count, self.distances)
            self.s_max = heapq.nlargest(pair_count, self.distances)

        low = sum(self.s_min)
        high = sum(self.s_max)
        return (self.s_c - low) / (high - low)
Esempio n. 2
0
 def update(self, X, n_clusters, labels, k, l, id):
     """Incrementally refresh cached state after one point changes cluster.

     Args:
         X: data points, indexed by point number.
         n_clusters: number of clusters.
         labels: cluster label of every point (presumably already
             reflecting the move -- confirm with the caller).
         k: first affected cluster.
         l: second affected cluster.
         id: index of the moved point in ``X``.

     Returns:
         The mean over clusters of the row-wise maximum of
         ``self.fractions``.
     """
     point = X[id]
     prev_centroids = np.copy(self.centroids)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
     self.centroids = cluster_centroid.update_centroids(
         np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

     # A centroid counts as "moved" only when it shifted by more than a
     # size-dependent fraction of the diameter.  The test does not depend on
     # the point being visited, so evaluate it once instead of per point.
     delta = 10**(-math.log(len(X), 10) - 1)
     threshold = delta * self.diameter
     moved_k = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
     moved_l = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold

     for i in range(len(labels)):
         if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
             self.dist_ps[i] = utils.d_ps(X, labels, X[i], labels[i], self.centroids)
     if moved_k:
         self.sym_s_clusters[k] = self.sym_s(X, labels, k, self.cluster_sizes, self.centroids)
     if moved_l:
         self.sym_s_clusters[l] = self.sym_s(X, labels, l, self.cluster_sizes, self.centroids)

     # Refresh every pairwise ratio that involves k or l (kept symmetric).
     for i in range(n_clusters):
         for c in (k, l):
             if i != c:
                 tm = utils.euclidian_dist(self.centroids[i], self.centroids[c])
                 self.fractions[i][c] = (self.sym_s_clusters[i] + self.sym_s_clusters[c]) / tm
                 self.fractions[c][i] = self.fractions[i][c]

     db = 0
     for i in range(n_clusters):
         db += np.amax(self.fractions[i])
     db /= float(n_clusters)
     return db
Esempio n. 3
0
 def update(self, X, n_clusters, labels, k, l, id):
     """Incrementally refresh cached state after one point changes cluster.

     Args:
         X: data points, indexed by point number.
         n_clusters: number of clusters.
         labels: cluster label of every point (presumably already
             reflecting the move -- confirm with the caller).
         k: cluster whose entry for the moved point is reset to zero.
         l: the other affected cluster.
         id: index of the moved point in ``X``.

     Returns:
         ``-(numerator / denominator)`` where the numerator sums the row
         minima of ``self.centroid_dists`` and the denominator sums, per
         cluster, ten times the total of its largest ``ceil(0.1 * size)``
         entries in ``self.dists`` divided by the cluster size.
     """
     point = X[id]
     prev_centroids = np.copy(self.centroids)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
     self.centroids = cluster_centroid.update_centroids(
         np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

     # Both centroids k and l may have moved, so every distance that involves
     # either of them has to be refreshed -- not only those towards
     # higher-numbered clusters.
     for i in range(n_clusters):
         for c in (k, l):
             if i != c:
                 self.centroid_dists[c][i] = utils.euclidian_dist(
                     self.centroids[i], self.centroids[c])
                 self.centroid_dists[i][c] = self.centroid_dists[c][i]

     numerator = 0.0
     for i in range(n_clusters):
         numerator += np.amin(self.centroid_dists[i])

     denominator = 0.0
     self.dists[k][id] = 0.
     delta = 10**(-math.log(len(X), 10) - 1)
     threshold = delta * self.diameter
     # Loop-invariant: decide once whether each centroid moved noticeably.
     moved_k = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
     moved_l = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold
     for i in range(len(labels)):
         if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
             self.dists[labels[i]][i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])
     # The moved point is new to cluster l, so its distance must be stored
     # even when centroid l itself barely shifted.
     if labels[id] == l and not moved_l:
         self.dists[l][id] = utils.euclidian_dist(point, self.centroids[l])

     for c in range(n_clusters):
         # get sum of 0.1*|Ck| largest elements
         acc = 0.0
         max_n = heapq.nlargest(int(math.ceil(0.1 * self.cluster_sizes[c])), self.dists[c])
         for value in max_n:
             acc += value
         denominator += acc * 10.0 / self.cluster_sizes[c]
     return -(numerator / denominator)
    def update(self, X, n_clusters, labels, k, l, id):
        """Incrementally refresh cached state after one point changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: first affected cluster.
            l: second affected cluster.
            id: index of the moved point in ``X``.

        Returns:
            ``sum(self.sigmas) / (n_clusters * self.normed_sigma_x)`` plus
            the normalised density term cached in ``self.dens``.
        """
        point = X[id]
        prev_centroids = np.copy(self.centroids)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

        # Decide once whether each affected centroid moved noticeably.
        delta = 10**(-math.log(len(X), 10) - 1)
        threshold = delta * self.diameter
        moved_k = utils.euclidian_dist(prev_centroids[k],
                                       self.centroids[k]) > threshold
        moved_l = utils.euclidian_dist(prev_centroids[l],
                                       self.centroids[l]) > threshold

        if moved_k:
            self.sigmas[k] = self.normed_cluster_sigma(X, labels, k)
        if moved_l:
            self.sigmas[l] = self.normed_cluster_sigma(X, labels, l)
        term1 = sum(self.sigmas) / (n_clusters * self.normed_sigma_x)
        stdev_val = self.stdev(n_clusters)

        if moved_k or moved_l:
            # Dedicated loop names: the original reused ``k``/``l`` here and
            # thereby clobbered the method's own parameters.
            dens = 0.0
            for a in range(n_clusters):
                for b in range(n_clusters):
                    dens += self.den2(X, labels, self.centroids, a, b, stdev_val) / \
                        max(self.den1(X, labels, self.centroids, a, stdev_val),
                            self.den1(X, labels, self.centroids, b, stdev_val))
            # Normalise only a freshly computed sum.  Dividing on every call
            # shrank the cached value again whenever no recomputation ran.
            self.dens = dens / (n_clusters * (n_clusters - 1))

        return (term1 + self.dens)
Esempio n. 5
0
 def update(self, X, n_clusters, labels, k, l, id):
     """Incrementally refresh cached state after one point changes cluster.

     Args:
         X: data points, indexed by point number.
         n_clusters: number of clusters.
         labels: cluster label of every point.
         k: first affected cluster.
         l: second affected cluster.
         id: index of the moved point in ``X``.

     Returns:
         ``-(max(dist_centroids) / (sum(dist_ps) * n_clusters))``.
     """
     point = X[id]
     prev_centroids = np.copy(self.centroids)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(
         labels, n_clusters)
     self.centroids = cluster_centroid.update_centroids(
         np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

     # Centroids k and l both changed, so every pair that contains one of
     # them must be refreshed.  Entries are kept in the upper triangle
     # (row < column), which the original only did for partners with a
     # higher index than k / l.
     for i in range(n_clusters):
         for c in (k, l):
             if i == c:
                 continue
             row, col = (c, i) if c < i else (i, c)
             self.dist_centroids[row][col] = utils.euclidian_dist(
                 self.centroids[i], self.centroids[c])
     numerator = np.amax(self.dist_centroids)

     # Loop-invariant: decide once whether each centroid moved noticeably.
     delta = 10**(-math.log(len(X), 10) - 1)
     threshold = delta * self.diameter
     moved_k = utils.euclidian_dist(prev_centroids[k],
                                    self.centroids[k]) > threshold
     moved_l = utils.euclidian_dist(prev_centroids[l],
                                    self.centroids[l]) > threshold
     for i in range(len(labels)):
         if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
             self.dist_ps[i] = utils.d_ps(X, labels, X[i], labels[i],
                                          self.centroids)
     denominator = sum(self.dist_ps)
     return -(numerator / (denominator * n_clusters))
Esempio n. 6
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Refresh the cached per-cluster terms after point ``id`` changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: cluster whose members get their distance to the moved point
                written into ``self.inner_max_dists``.
            l: cluster whose members get that entry cleared.
            id: index of the moved point in ``X``.

        Returns:
            ``sum(self.accumulator) / len(labels)``.
        """
        moved = X[id]
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), moved, k, l)

        # Rebuild the sums of point-to-centroid distances for both clusters.
        self.numerators[k] = 0.0
        self.numerators[l] = 0.0
        for idx, label in enumerate(labels):
            if label in (k, l):
                self.numerators[label] += utils.euclidian_dist(
                    X[idx], self.centroids[label])

        # Update the pairwise table entries that involve the moved point.
        for idx, label in enumerate(labels):
            if label == k:
                self.inner_max_dists[idx][id] = utils.euclidian_dist(
                    X[idx], moved)
                self.inner_max_dists[id][idx] = self.inner_max_dists[idx][id]
            if label == l:
                self.inner_max_dists[idx][id] = 0
                self.inner_max_dists[id][idx] = 0
                self.outer_min_dists[l][id] = sys.float_info.max

        for cluster in (k, l):
            for idx, label in enumerate(labels):
                if label == cluster:
                    continue
                # Largest stored distance from this outside point to any
                # member of the cluster.
                farthest = 0
                for other, gap in enumerate(self.inner_max_dists[idx]):
                    if labels[other] == cluster:
                        farthest = max(farthest, gap)
                if farthest != 0:
                    self.outer_min_dists[cluster][idx] = farthest
            closest = np.amin(self.outer_min_dists[cluster])
            self.accumulator[cluster] = self.numerators[cluster] / closest
        return sum(self.accumulator) / len(labels)
Esempio n. 7
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after one point changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: first affected cluster.
            l: second affected cluster.
            id: index of the moved point in ``X``.

        Returns:
            ``-(min_delta / max_denominator)`` where ``min_delta`` is the
            smallest off-diagonal entry of ``self.delta`` and the denominator
            is the largest value of ``2 * sum / size`` over all clusters.
        """
        point = X[id]
        prev_cluster_sizes = list(self.cluster_sizes)
        prev_centroids = np.copy(self.centroids)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(np.copy(labels), n_clusters)
        self.centroids = cluster_centroid.update_centroids(self.centroids, self.cluster_sizes, point, k, l)
        minimum_dif_c = sys.float_info.max  # min dist in different clusters

        # update numerator

        # NOTE(review): the refreshed distances live only in this local copy;
        # ``self.centroid_dists`` and ``self.sums`` are not written back.
        new_centroid_dists = list(self.centroid_dists)
        dell = 10**(-math.log(len(X), 10) - 1)
        threshold = dell * self.diameter
        # Loop-invariant: decide once whether each centroid moved noticeably.
        moved_k = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
        moved_l = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold
        for i in range(len(labels)):
            if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
                new_centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])

        # Kept from the original: off-diagonal entries are overwritten below,
        # so this only has a lasting effect on the diagonal.
        for i in range(n_clusters):
            for j in range(n_clusters):
                self.delta[i][j] *= (prev_cluster_sizes[i] + prev_cluster_sizes[j])

        new_sums = [0 for _ in range(n_clusters)]
        for i in range(n_clusters):
            if i != k and i != l:
                new_sums[i] = self.sums[i]
        for i in range(len(labels)):
            if labels[i] == k or labels[i] == l:
                new_sums[labels[i]] += new_centroid_dists[i]

        for i in range(n_clusters):
            for j in range(n_clusters):
                if i != j:
                    if self.cluster_sizes[i] + self.cluster_sizes[j] == 0:
                        self.delta[i][j] = float('inf')
                    else:
                        self.delta[i][j] = (new_sums[i] + new_sums[j]) / float(self.cluster_sizes[i] + self.cluster_sizes[j])
                    minimum_dif_c = min(minimum_dif_c, self.delta[i][j])

        # update denominator
        denominator = list(new_sums)
        for i in range(n_clusters):
            if self.cluster_sizes[i] == 0:
                denominator[i] = float('inf')
            else:
                # 2.0 forces true division even when sizes are plain ints
                # under integer-division semantics.
                denominator[i] *= (2.0 / self.cluster_sizes[i])

        return -(minimum_dif_c / max(denominator))
Esempio n. 8
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after one point changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: cluster whose pair sums lose the moved point's distances.
            l: cluster whose pair sums gain the moved point's distances.
            id: index of the moved point in ``X``.

        Returns:
            ``-(min_delta / max_denominator)`` where ``min_delta`` is the
            smallest non-zero entry of ``self.delta`` above the diagonal and
            the denominator is the largest ``2 * sum / size`` over clusters.
        """
        self.diameter = utils.find_diameter(X)
        prev_point_in_c = list(self.point_in_c)
        prev_centroids = np.copy(self.centroids)
        self.point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(np.copy(self.centroids), np.copy(self.point_in_c), X[id], k, l)
        minimum_dif_c = sys.float_info.max  # min dist in different clusters

        # update numerator

        # Turn the stored averages back into sums using the previous sizes.
        for i in range(n_clusters):
            for j in range(n_clusters):
                self.delta[i][j] *= (prev_point_in_c[i] * prev_point_in_c[j])

        # Move the point's distances from the pairs of k to the pairs of l.
        # ``self.dists`` is read with the smaller point index first.
        for i in range(len(labels)):
            if labels[i] != k:
                if id < i:
                    self.delta[k][labels[i]] -= self.dists[id][i]
                elif id > i:
                    self.delta[labels[i]][k] -= self.dists[i][id]
            if labels[i] != l:
                if id < i:
                    self.delta[l][labels[i]] += self.dists[id][i]
                elif id > i:
                    self.delta[labels[i]][l] += self.dists[i][id]

        # NOTE(review): only entries above the diagonal are re-normalised
        # here, although the loop above scaled the whole matrix.
        for i in range(n_clusters - 1):
            for j in range(i + 1, n_clusters):
                self.delta[i][j] /= float(self.point_in_c[i] * self.point_in_c[j])
                if self.delta[i][j] != 0:
                    minimum_dif_c = min(minimum_dif_c, self.delta[i][j])

        # update denominator
        new_centroid_dists = list(self.centroid_dists)
        dell = 10 ** (-math.log(len(X), 10) - 1)
        threshold = dell * self.diameter
        # Loop-invariant: decide once whether each centroid moved noticeably.
        moved_k = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
        moved_l = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold
        for i in range(len(labels)):
            if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
                new_centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])
        new_sums = [0 for _ in range(n_clusters)]
        for i in range(n_clusters):
            if i != k and i != l:
                new_sums[i] = self.sums[i]
        for i in range(len(labels)):
            if labels[i] == k or labels[i] == l:
                new_sums[labels[i]] += new_centroid_dists[i]
        denominator = list(new_sums)
        for i in range(n_clusters):
            if self.point_in_c[i] != 0:
                # 2.0 forces true division even when sizes are plain ints
                # under integer-division semantics.
                denominator[i] *= (2.0 / self.point_in_c[i])
        return -(minimum_dif_c / max(denominator))
Esempio n. 9
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after one point changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: first affected cluster.
            l: second affected cluster.
            id: index of the moved point in ``X``.

        Returns:
            ``-(min(self.centers) / max_denominator)`` where the denominator
            is the largest ``2 * sum / size`` over all clusters.
        """
        point = X[id]
        prev_centroids = np.copy(self.centroids)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            np.copy(labels), n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

        # update denominator

        new_centroid_dists = list(self.centroid_dists)
        dell = 10**(-math.log(len(X), 10) - 1)
        threshold = dell * self.diameter
        # Loop-invariant: decide once whether each centroid moved noticeably.
        moved_k = utils.euclidian_dist(prev_centroids[k],
                                       self.centroids[k]) > threshold
        moved_l = utils.euclidian_dist(prev_centroids[l],
                                       self.centroids[l]) > threshold
        for i in range(len(labels)):
            if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
                new_centroid_dists[i] = utils.euclidian_dist(
                    X[i], self.centroids[labels[i]])
        new_sums = [0 for _ in range(n_clusters)]
        for i in range(n_clusters):
            if i != k and i != l:
                new_sums[i] = self.sums[i]
        for i in range(len(labels)):
            if labels[i] == k or labels[i] == l:
                new_sums[labels[i]] += new_centroid_dists[i]
        denominator = list(new_sums)
        for i in range(n_clusters):
            if self.cluster_sizes[i] != 0:
                # 2.0 forces true division even when sizes are plain ints
                # under integer-division semantics.
                denominator[i] *= (2.0 / self.cluster_sizes[i])

        # update numerator

        # Refresh every centroid distance that involves k or l (symmetric).
        for i in range(n_clusters):
            for c in (k, l):
                if i != c:
                    self.centers[i][c] = utils.euclidian_dist(
                        self.centroids[i], self.centroids[c])
                    self.centers[c][i] = self.centers[i][c]

        minimum_dif_c = np.amin(self.centers)
        return -(minimum_dif_c / max(denominator))
Esempio n. 10
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after one point changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: cluster whose pairs with the moved point stop being
                same-cluster pairs.
            l: cluster whose members form new same-cluster pairs with the
                moved point.
            id: index of the moved point in ``X``.

        Returns:
            ``-min_delta / max_same`` where ``min_delta`` is the smallest
            non-zero entry of ``self.delta`` above the diagonal and
            ``max_same`` the largest distance among same-cluster pairs.
        """
        prev_cluster_sizes = list(self.cluster_sizes)
        # NOTE(review): the centroids are updated with the sizes from before
        # the recount below -- confirm that update_centroids expects that.
        self.centroids = cluster_centroid.update_centroids(
            list(self.centroids), list(self.cluster_sizes), X[id], k, l)

        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)

        minimum_dif_c = sys.float_info.max  # min dist in different clusters
        maximum_same_c = sys.float_info.min  # max dist in the same cluster

        # Pairs (moved point, member of k) are no longer same-cluster pairs.
        # A set gives O(1) membership tests.
        stale_pairs = set()
        for i in range(0, len(labels)):
            if labels[i] == k:
                stale_pairs.add((i, id))
                stale_pairs.add((id, i))
            if labels[i] == l and i != id:
                self.dist_same_c.append([i, id])
                self.dist_same_c.append([id, i])

        # Drop the stale pairs from the persistent list.  The original only
        # skipped them for the current call, so they came back as bogus
        # same-cluster pairs on every later call.
        self.dist_same_c[:] = [
            pair for pair in self.dist_same_c
            if (pair[0], pair[1]) not in stale_pairs
        ]

        for pair in self.dist_same_c:
            cur = self.dists[pair[0]][pair[1]]
            if cur > maximum_same_c:
                maximum_same_c = cur

        # Turn the stored averages back into sums using the previous sizes.
        for i in range(n_clusters - 1):
            for j in range(i + 1, n_clusters):
                self.delta[i][j] *= (prev_cluster_sizes[i] *
                                     prev_cluster_sizes[j])
        # Move the point's distances from the pairs of k to the pairs of l.
        for i in range(len(labels)):
            if labels[i] != k and id < i:
                self.delta[k][labels[i]] -= self.dists[id][i]
            if labels[i] != k and id > i:
                self.delta[labels[i]][k] -= self.dists[i][id]
            if labels[i] != l and id < i:
                self.delta[l][labels[i]] += self.dists[id][i]
            if labels[i] != l and id > i:
                self.delta[labels[i]][l] += self.dists[i][id]
        for i in range(n_clusters - 1):
            for j in range(i + 1, n_clusters):
                self.delta[i][j] /= float(self.cluster_sizes[i] *
                                          self.cluster_sizes[j])
                if self.delta[i][j] != 0:
                    minimum_dif_c = min(minimum_dif_c, self.delta[i][j])
        return -minimum_dif_c / maximum_same_c
Esempio n. 11
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Refresh the per-point ``a``/``b`` values after point ``id`` changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: cluster from whose averages the moved point's distance is
                removed.
            l: cluster to whose averages the moved point's distance is added.
            id: index of the moved point in ``X``.

        Returns:
            The negated mean of ``(b - a) / max(b, a)`` over all points.
        """
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), X[id], k, l)
        self.a_ss[id] = self.a(X, labels, id, l)
        self.b_ss[id] = self.b(X, n_clusters, labels, id, l)

        size_k = self.cluster_sizes[k]
        size_l = self.cluster_sizes[l]

        def rescale(values, pos, weight, shift, divisor):
            # Scale the stored value by ``weight``, apply ``shift`` and
            # divide by ``divisor``; a zero divisor yields +inf instead.
            values[pos] *= weight
            values[pos] += shift
            if divisor == 0:
                values[pos] = float('inf')
            else:
                values[pos] /= divisor

        for idx, label in enumerate(labels):
            if idx == id:
                continue
            gap = self.dists_e[idx][id]
            if label == k:
                rescale(self.a_ss, idx, size_k + 1, -gap, size_k)
            if label == l:
                rescale(self.a_ss, idx, size_l - 1, gap, size_l)
            per_cluster = self.dists_for_b[idx]
            rescale(per_cluster, l, size_l - 1, gap, size_l)
            rescale(per_cluster, k, size_k + 1, -gap, size_k)
            self.b_ss[idx] = min(per_cluster)

        total = 0
        for idx in range(len(labels)):
            b_val = self.b_ss[idx]
            a_val = self.a_ss[idx]
            total += (b_val - a_val) / max(b_val, a_val)
        return -(total / float(len(labels)))
Esempio n. 12
0
 def update(self, X, n_clusters, labels, k, l, id):
     """Incrementally refresh cached state after one point changes cluster.

     Args:
         X: data points, indexed by point number.
         n_clusters: number of clusters.
         labels: cluster label of every point.
         k: first affected cluster.
         l: second affected cluster.
         id: index of the moved point in ``X``.

     Returns:
         The mean over clusters of ``max(max_s_sum[i]) /
         min(min_centroids_dist[i])``.
     """
     point = X[id]
     prev_centroids = np.copy(self.centroids)
     delta = 10**(-math.log(len(X), 10) - 1)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
     self.centroids = cluster_centroid.update_centroids(self.centroids, self.cluster_sizes, point, k, l)
     if utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > delta * self.diameter:
         self.s_clusters[k] = self.s(X, k, self.cluster_sizes, labels, self.centroids)
     if utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > delta * self.diameter:
         self.s_clusters[l] = self.s(X, l, self.cluster_sizes, labels, self.centroids)

     # Every pair that contains k or l is affected.  Entries are kept in the
     # upper triangle (row < column); the original refreshed only partners
     # with a higher index than k / l and left the (i, k), i < k entries
     # stale.
     for i in range(n_clusters):
         for c in (k, l):
             if i == c:
                 continue
             row, col = (c, i) if c < i else (i, c)
             self.max_s_sum[row][col] = self.s_clusters[i] + self.s_clusters[c]
             self.min_centroids_dist[row][col] = utils.euclidian_dist(
                 self.centroids[i], self.centroids[c])

     numerator = 0.0
     for i in range(n_clusters):
         numerator += np.max(self.max_s_sum[i]) / np.min(self.min_centroids_dist[i])
     return numerator / n_clusters
Esempio n. 13
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after one point changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: cluster whose pairs with the moved point stop being
                same-cluster pairs.
            l: cluster whose members form new same-cluster pairs with the
                moved point.
            id: index of the moved point in ``X``.

        Returns:
            ``-(min(self.centers) / max_same)`` where ``max_same`` is the
            largest distance among same-cluster pairs.
        """
        point = X[id]
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)
        maximum_same_c = sys.float_info.min  # max dist in the same cluster

        # update denominator

        # Pairs (moved point, member of k) are no longer same-cluster pairs.
        # A set gives O(1) membership tests.
        stale_pairs = set()
        for i in range(len(labels)):
            if labels[i] == k:
                stale_pairs.add((i, id))
                stale_pairs.add((id, i))
            if labels[i] == l and i != id:
                self.dist_same_c.append([i, id])
                self.dist_same_c.append([id, i])

        # Drop the stale pairs from the persistent list.  The original only
        # skipped them for the current call, so they came back as bogus
        # same-cluster pairs on every later call.
        self.dist_same_c[:] = [
            pair for pair in self.dist_same_c
            if (pair[0], pair[1]) not in stale_pairs
        ]

        for pair in self.dist_same_c:
            cur = self.dists[pair[0]][pair[1]]
            if cur > maximum_same_c:
                maximum_same_c = cur

        # update numerator

        # Refresh every centroid distance that involves k or l (symmetric).
        for i in range(n_clusters):
            for c in (k, l):
                if i != c:
                    self.centers[i][c] = utils.euclidian_dist(
                        self.centroids[i], self.centroids[c])
                    self.centers[c][i] = self.centers[i][c]
        minimum_dif_c = np.amin(self.centers)
        return -(minimum_dif_c / maximum_same_c)
    def update(self, X, n_clusters, labels, k, l, id):
        """Refresh the cached scatter ratios after point ``id`` changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: first affected cluster.
            l: second affected cluster.
            id: index of the moved point in ``X``.

        Returns:
            The mean over clusters of the row-wise maximum of ``self.sums``.
        """
        moved = X[id]
        tolerance = 10**(-math.log(len(X), 10) - 1)
        old_centroids = np.copy(self.centroids)
        # NOTE(review): ``self.points_in_clusters`` is not recounted here.
        self.centroids = cluster_centroid.update_centroids(
            self.centroids, self.points_in_clusters, moved, k, l)

        # Recompute a cluster's scatter only if its centroid moved noticeably.
        for cluster in (k, l):
            shift = utils.euclidian_dist(old_centroids[cluster],
                                         self.centroids[cluster])
            if shift > tolerance * self.diameter:
                self.s_clusters[cluster] = self.s(
                    X, cluster, self.points_in_clusters, labels,
                    self.centroids)

        # Refresh the symmetric ratio table for every pair containing k or l;
        # pairs with coinciding centroids keep their previous value.
        for other in range(n_clusters):
            for cluster in (k, l):
                if other == cluster:
                    continue
                gap = utils.euclidian_dist(self.centroids[other],
                                           self.centroids[cluster])
                if gap != 0:
                    ratio = (self.s_clusters[other] +
                             self.s_clusters[cluster]) / gap
                    self.sums[other][cluster] = ratio
                    self.sums[cluster][other] = ratio

        total = 0
        for row in range(n_clusters):
            total += np.amax(self.sums[row])
        total /= float(n_clusters)
        return total
Esempio n. 15
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Refresh the cached distance tables after point ``id`` changes cluster.

        Args:
            X: two-dimensional data array; the first axis indexes points.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: cluster whose members get their entry for the moved point
                cleared in ``self.dists``.
            l: cluster whose members get their distance to the moved point
                written into ``self.dists``.
            id: index of the moved point in ``X``.

        Returns:
            ``numerator / denominator`` where the numerator sums each point's
            row maximum of ``self.dists`` divided by its cluster size and the
            denominator sums the row minima of ``self.centroids_dist``.
        """
        moved = X[id]
        n_points, _ = X.shape
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), moved, k, l)

        for idx in range(n_points):
            label = labels[idx]
            if label == k:
                self.dists[idx][id] = 0
                self.dists[id][idx] = 0
            if label == l:
                self.dists[id][idx] = utils.euclidian_dist(X[idx], moved)
                self.dists[idx][id] = self.dists[id][idx]

        numerator = 0.0
        for idx in range(n_points):
            numerator += np.amax(self.dists[idx]) / self.cluster_sizes[labels[idx]]

        # Refresh every centroid distance that involves k or l (symmetric).
        for other in range(n_clusters):
            for cluster in (k, l):
                if other != cluster:
                    self.centroids_dist[cluster][other] = utils.euclidian_dist(
                        self.centroids[other], self.centroids[cluster])
                    self.centroids_dist[other][cluster] = \
                        self.centroids_dist[cluster][other]

        denominator = 0.0
        for row in range(n_clusters):
            denominator += np.amin(self.centroids_dist[row])

        return numerator / denominator
Esempio n. 16
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after one point changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: cluster whose pairs with the moved point stop being
                same-cluster pairs.
            l: cluster whose members form new same-cluster pairs with the
                moved point.
            id: index of the moved point in ``X``.

        Returns:
            ``-(min_delta / max_same)`` where ``min_delta`` is the smallest
            off-diagonal entry of ``self.delta`` and ``max_same`` the largest
            distance among same-cluster pairs.
        """
        point = X[id]
        prev_cluster_sizes = list(self.cluster_sizes)
        prev_centroids = np.copy(self.centroids)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)
        minimum_dif_c = sys.float_info.max  # min dist in different clusters
        maximum_same_c = sys.float_info.min  # max dist in the same cluster

        # update denominator

        # Pairs (moved point, member of k) are no longer same-cluster pairs.
        # A set gives O(1) membership tests.
        stale_pairs = set()
        for i in range(0, len(labels)):
            if labels[i] == k:
                stale_pairs.add((i, id))
                stale_pairs.add((id, i))
            if labels[i] == l and i != id:
                self.dists_same_c.append([i, id])
                self.dists_same_c.append([id, i])

        # Drop the stale pairs from the persistent list.  The original only
        # skipped them for the current call, so they came back as bogus
        # same-cluster pairs on every later call.
        self.dists_same_c[:] = [
            pair for pair in self.dists_same_c
            if (pair[0], pair[1]) not in stale_pairs
        ]

        for pair in self.dists_same_c:
            cur = self.dists[pair[0]][pair[1]]
            if cur > maximum_same_c:
                maximum_same_c = cur

        # update numerator

        new_centroid_dists = list(self.centroid_dists)
        dell = 10**(-math.log(len(X), 10) - 1)
        threshold = dell * self.diameter
        # Loop-invariant: decide once whether each centroid moved noticeably.
        moved_k = utils.euclidian_dist(prev_centroids[k],
                                       self.centroids[k]) > threshold
        moved_l = utils.euclidian_dist(prev_centroids[l],
                                       self.centroids[l]) > threshold
        for i in range(len(labels)):
            if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
                new_centroid_dists[i] = utils.euclidian_dist(
                    X[i], self.centroids[labels[i]])

        # Kept from the original: off-diagonal entries are overwritten below,
        # so this only has a lasting effect on the diagonal.
        for i in range(n_clusters):
            for j in range(n_clusters):
                self.delta[i][j] *= (prev_cluster_sizes[i] +
                                     prev_cluster_sizes[j])

        new_sums = [0 for _ in range(n_clusters)]
        for i in range(n_clusters):
            if i != k and i != l:
                new_sums[i] = self.sums[i]
        for i in range(len(labels)):
            if labels[i] == k or labels[i] == l:
                new_sums[labels[i]] += new_centroid_dists[i]

        for i in range(n_clusters):
            for j in range(n_clusters):
                if i != j:
                    pair_size = self.cluster_sizes[i] + self.cluster_sizes[j]
                    if pair_size == 0:
                        # Two empty clusters: no meaningful average, and it
                        # must not win the minimum.
                        self.delta[i][j] = float('inf')
                    else:
                        self.delta[i][j] = (new_sums[i] +
                                            new_sums[j]) / float(pair_size)
                    minimum_dif_c = min(minimum_dif_c, self.delta[i][j])
        return -(minimum_dif_c / maximum_same_c)
Esempio n. 17
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Incrementally refresh cached state after one point changes cluster.

        Args:
            X: data points, indexed by point number.
            n_clusters: number of clusters.
            labels: cluster label of every point.
            k: cluster whose members have the moved point's distance removed
                from their ``a`` value.
            l: cluster whose members have the moved point's distance added
                to their ``a`` value.
            id: index of the moved point in ``X``.

        Returns:
            ``-(numerator / denominator)`` where the numerator sums
            ``self.ov(i)`` over all points and the denominator sums, per
            cluster, ten times the total of its largest ``ceil(0.1 * size)``
            entries in ``self.dists`` divided by the cluster size.
        """
        point = X[id]
        prev_centroids = np.copy(self.centroids)
        prev_cluster_sizes = list(self.cluster_sizes)
        # NOTE(review): the centroids are updated with the sizes from before
        # the recount on the next line -- confirm update_centroids expects that.
        self.centroids = cluster_centroid.update_centroids(self.centroids, self.cluster_sizes, point, k, l)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
        # NOTE(review): only referenced from commented-out code below.
        prev_dists_for_b = list(self.dists_for_b)
        # The moved point gets its a/b values recomputed from scratch.
        self.a_ss[id] = self.a(X, labels, id, l)
        self.b_ss[id] = self.b(X, labels, id, l)
        for i in range(len(labels)):
            if i == id:
                continue
            if labels[i] == k:
                # Rescale the stored value with the old size, drop the moved
                # point's distance, then divide by the new size.
                self.a_ss[i] *= prev_cluster_sizes[k]
                self.a_ss[i] -= self.dists_e[i][id]
                self.a_ss[i] /= self.cluster_sizes[k]
                self.b_ss[i] *= prev_cluster_sizes[k]
                # NOTE(review): ``j`` is only used by commented-out code.
                j = prev_cluster_sizes[k]
                #if prev_dists_for_b[i][j] != float('inf'):
                    #self.b_ss[i] -= prev_dists_for_b[i][j]
                self.dists_for_b[i][id] = self.dists_e[i][id]
                if self.max_b_ss[i] > self.dists_e[i][id]:
                    # The moved point is closer than the current largest
                    # contribution: swap it in and recompute the maximum.
                    self.b_ss[i] -= self.max_b_ss[i]
                    self.b_ss[i] += self.dists_e[i][id]
                    filtered = [x for x in self.dists_for_b[i] if x != float('inf')]
                    self.max_b_ss[i] = max(filtered)
                elif self.b_ss_size[i] < self.cluster_sizes[k]:
                    self.b_ss[i] += self.dists_e[i][id]
                    self.b_ss_size[i] += 1
                self.b_ss[i] /= self.cluster_sizes[k]
            if labels[i] == l:
                # Rescale the stored value with the old size, add the moved
                # point's distance, then divide by the new size.
                self.a_ss[i] *= prev_cluster_sizes[l]
                self.a_ss[i] += self.dists_e[i][id]
                self.a_ss[i] /= self.cluster_sizes[l]
                self.b_ss[i] *= prev_cluster_sizes[l]
                # NOTE(review): ``j`` is only used by commented-out code.
                j = prev_cluster_sizes[l]
                #self.b_ss[i] += self.dists_e[i][j]
                self.b_ss[i] -= self.dists_e[i][id]
                self.b_ss_size[i] -= 1
                # inf marks the moved point as excluded from this row.
                self.dists_for_b[i][id] = float('inf')
                if self.b_ss_size[i] < self.cluster_sizes[l]:
                    filtered = [x for x in self.dists_for_b[i] if x != float('inf')]
                    new_max = max(filtered)
                    self.b_ss[i] += new_max
                    self.b_ss_size[i] += 1
                    self.max_b_ss[i] = new_max
                #if self.dists_for_b[i][j] < self.dists_e[i][id]:
                    #self.b_ss[i] -= self.dists_e[i][id]
                #    self.b_ss[i] += self.dists_for_b[i][j]
                #elif self.dists_for_b[i][j - 1] == float('inf'):

                self.b_ss[i] /= self.cluster_sizes[l]
        # Every point is visited exactly once, grouped by cluster.
        numerator = 0.0
        for c in range(n_clusters):
            for i in range(len(labels)):
                if labels[i] != c:
                    continue
                numerator += self.ov(i)
        denominator = 0.0
        # The moved point's entry for cluster k is cleared.
        self.dists[k][id] = 0.
        #delta = 10**(-math.log(len(X), 10) - 1)
        delta = 10**(-9)
        # Point-to-centroid distances are refreshed only for clusters whose
        # centroid moved by more than delta * diameter.
        # NOTE(review): if centroid l moved less than that, the moved point's
        # own entry self.dists[l][id] is not written here -- confirm intent.
        for i in range(len(labels)):
            if (labels[i] == k and utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > delta * self.diameter
               or labels[i] == l and utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > delta * self.diameter):
                self.dists[labels[i]][i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])
        for c in range(n_clusters):
            # get sum of 0.1*|Ck| largest elements
            max_n = heapq.nlargest(int(math.ceil(0.1 * self.cluster_sizes[c])), self.dists[c])
            denominator += sum(max_n) * 10.0 / self.cluster_sizes[c]
        return -(numerator / denominator)