def find(self, X, labels, n_clusters):
    """Evaluate the index from scratch and cache the intermediate state.

    The score is ``-(min_delta / max_same)``. ``min_delta`` is the smallest
    value of ``(sums[i] + sums[j]) / (size_i + size_j)`` over all pairs of
    distinct clusters, where ``sums[c]`` adds up the distance of every point
    labelled ``c`` to centroid ``c``. ``max_same`` is the largest distance
    between two points that carry the same label.

    Args:
        X: indexable collection of points (``X[i]`` is one point).
        labels: cluster label of each point; used to index per-cluster lists.
        n_clusters: number of clusters.

    Returns:
        The negated ratio described above.
    """
    n_points = len(labels)
    self.diameter = utils.find_diameter(X)
    self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    self.cluster_sizes = cluster_centroid.count_cluster_sizes(
        labels, n_clusters)
    self.dists = [[0 for _ in range(n_points)] for _ in range(n_points)]
    self.centroid_dists = [0 for _ in range(n_points)]
    self.delta = [[0 for _ in range(n_clusters)] for _ in range(n_clusters)]
    self.sums = [0 for _ in range(n_clusters)]
    # Rebuilt on every call so a second find() does not inherit stale pairs.
    self.dists_same_c = []
    minimum_dif_c = sys.float_info.max  # min separation of different clusters
    # sys.float_info.min is the smallest positive float, not a negative
    # number; any distance > 0 replaces it.
    maximum_same_c = sys.float_info.min  # max dist in the same cluster

    # Distance of every point to its own centroid, summed per cluster.
    for i in range(n_points):
        self.centroid_dists[i] = utils.euclidian_dist(
            X[i], self.centroids[labels[i]])
        self.sums[labels[i]] += self.centroid_dists[i]

    # Symmetric point-to-point distance matrix.
    for i in range(n_points - 1):
        for j in range(i + 1, n_points):
            dist = utils.euclidian_dist(X[i], X[j])
            self.dists[i][j] = dist
            self.dists[j][i] = dist
            # Only pairs sharing a label belong to the within-cluster set.
            if labels[i] == labels[j]:
                self.dists_same_c.append([i, j])
                maximum_same_c = max(dist, maximum_same_c)

    for i in range(n_clusters):
        for j in range(n_clusters):
            if i != j:
                self.delta[i][j] = (self.sums[i] + self.sums[j]) / float(
                    self.cluster_sizes[i] + self.cluster_sizes[j])
                minimum_dif_c = min(minimum_dif_c, self.delta[i][j])
    return -(minimum_dif_c / maximum_same_c)
Example #2
0
 def update(self, X, n_clusters, labels, k, l, id):
     """Refresh the cached state after a change to clusters ``k`` and ``l``.

     Cluster sizes are recounted from ``labels`` and the centroids are
     refreshed through ``cluster_centroid.update_centroids``. Per-point and
     per-cluster values are only recomputed for a cluster whose centroid
     moved farther than ``diameter / (10 * len(X))``.

     Args:
         X: indexable collection of points.
         n_clusters: number of clusters.
         labels: cluster label of each point.
         k: first affected cluster index.
         l: second affected cluster index.
         id: index into ``X`` of the point handed to ``update_centroids``.

     Returns:
         Mean over clusters of the row maximum of ``self.fractions``.
     """
     point = X[id]
     prev_centroids = np.copy(self.centroids)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
     self.centroids = cluster_centroid.update_centroids(
         np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

     # The shift tests do not depend on the point, so evaluate them once
     # instead of once per labelled point.
     delta = 10**(-math.log(len(X), 10) - 1)
     threshold = delta * self.diameter
     k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
     l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold

     for i in range(len(labels)):
         if (labels[i] == k and k_moved) or (labels[i] == l and l_moved):
             self.dist_ps[i] = utils.d_ps(X, labels, X[i], labels[i], self.centroids)
     if k_moved:
         self.sym_s_clusters[k] = self.sym_s(X, labels, k, self.cluster_sizes, self.centroids)
     if l_moved:
         self.sym_s_clusters[l] = self.sym_s(X, labels, l, self.cluster_sizes, self.centroids)

     # Only the rows and columns of k and l can have changed.
     for i in range(n_clusters):
         if i != k:
             tm = utils.euclidian_dist(self.centroids[i], self.centroids[k])
             self.fractions[i][k] = (self.sym_s_clusters[i] + self.sym_s_clusters[k]) / tm
             self.fractions[k][i] = self.fractions[i][k]
         if i != l:
             tm = utils.euclidian_dist(self.centroids[i], self.centroids[l])
             self.fractions[i][l] = (self.sym_s_clusters[i] + self.sym_s_clusters[l]) / tm
             self.fractions[l][i] = self.fractions[i][l]

     db = 0
     for i in range(n_clusters):
         db += np.amax(self.fractions[i])
     db /= float(n_clusters)
     return db
Example #3
0
    def find(self, X, labels, n_clusters):
        """Compute the index from scratch and cache distances for later use.

        The result is ``-(closest / widest)``: ``closest`` is the smallest
        centroid-to-centroid distance among label pairs that occur on two
        different points, ``widest`` is the largest distance between two
        points with the same label.

        Args:
            X: 2-D array of points (``X.shape`` is unpacked into two values).
            labels: cluster label of each row of ``X``.
            n_clusters: number of clusters.

        Returns:
            The negated ratio described above.
        """
        n_points, _ = X.shape
        self.diameter = utils.find_diameter(X)
        self.centroids = cluster_centroid.cluster_centroid(
            X, labels, n_clusters)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.dist_same_c = []
        self.dists = [[0. for _ in range(n_points)] for _ in range(n_points)]

        # Centroid distance table; the diagonal keeps the float-max filler.
        self.centers = np.array(
            [[sys.float_info.max] * n_clusters] * n_clusters)
        for a in range(n_clusters):
            for b in range(n_clusters):
                if a == b:
                    continue
                self.centers[a][b] = utils.euclidian_dist(
                    self.centroids[a], self.centroids[b])

        closest_other = sys.float_info.max
        widest_same = sys.float_info.min
        for first in range(n_points - 1):
            for second in range(first + 1, n_points):
                self.dists[first][second] = utils.euclidian_dist(
                    X[first], X[second])
                self.dists[second][first] = self.dists[first][second]
                if labels[first] != labels[second]:
                    gap = self.centers[labels[first]][labels[second]]
                    closest_other = min(gap, closest_other)
                    continue
                self.dist_same_c.append([first, second])
                widest_same = max(self.dists[first][second], widest_same)
        return -(closest_other / widest_same)
    def find(self, X, labels, n_clusters):
        """Compute the index from scratch and cache the intermediate state.

        The result is ``-(closest / spread)``: ``closest`` is the smallest
        centroid-to-centroid distance among clusters that own at least one
        point, ``spread`` is the largest value of ``2 * sums[c] / size_c``,
        where ``sums[c]`` adds up the distance of every point labelled ``c``
        to centroid ``c``.

        Args:
            X: 2-D array of points (``X.shape`` is unpacked into two values).
            labels: cluster label of each row of ``X``.
            n_clusters: number of clusters.

        Returns:
            The negated ratio described above.
        """
        self.diameter = utils.find_diameter(X)
        self.centroids = cluster_centroid.cluster_centroid(
            X, labels, n_clusters)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        rows, _ = X.shape
        self.sums = [0 for _ in range(n_clusters)]
        minimum_dif_c = sys.float_info.max  # min dist in different clusters
        self.centers = np.array(
            [[sys.float_info.max] * n_clusters] * n_clusters)
        self.centroid_dists = [0 for _ in range(len(labels))]
        for i in range(len(labels)):
            self.centroid_dists[i] = utils.euclidian_dist(
                X[i], self.centroids[labels[i]])
            self.sums[labels[i]] += self.centroid_dists[i]
        for i in range(n_clusters):
            for j in range(n_clusters):
                if i != j:
                    self.centers[i][j] = utils.euclidian_dist(
                        self.centroids[i], self.centroids[j])

        # The minimum only depends on which labels occur, so walk the
        # distinct labels instead of every pair of points.
        present = {labels[i] for i in range(rows)}
        for a in present:
            for b in present:
                if a != b:
                    minimum_dif_c = min(self.centers[a][b], minimum_dif_c)

        denominator = list(self.sums)
        for i in range(n_clusters):
            if self.cluster_sizes[i] == 0:
                # Same convention as the incremental update: an empty
                # cluster yields an infinite spread instead of a division
                # by zero.
                denominator[i] = float('inf')
            else:
                denominator[i] *= (2.0 / self.cluster_sizes[i])

        return -(minimum_dif_c / max(denominator))
 def find(self, X, labels, n_clusters):
     """Compute the index from scratch and cache distances for later use.

     The result is ``-closest / widest``: ``closest`` is the smallest
     non-zero mean point-to-point distance between two different clusters,
     ``widest`` is the largest distance between two points sharing a label.

     Layout of ``self.delta`` (unchanged by this fix): a pair of points
     ``i < j`` with different labels adds its distance to
     ``delta[labels[i]][labels[j]]``, so the sum for a cluster pair may be
     split between ``delta[a][b]`` and ``delta[b][a]``. Only the entries
     above the diagonal are divided by ``size_a * size_b``.

     Args:
         X: 2-D array of points (``X.shape`` is unpacked into two values).
         labels: cluster label of each row of ``X``.
         n_clusters: number of clusters.

     Returns:
         The negated ratio described above.
     """
     self.centroids = cluster_centroid.cluster_centroid(
         X, labels, n_clusters)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(
         labels, n_clusters)
     self.diameter = utils.find_diameter(X)
     self.dists = [[0. for _ in range(len(labels))]
                   for _ in range(len(labels))]
     self.dist_same_c = []
     rows, _ = X.shape
     self.delta = np.array([[0.0] * n_clusters] * n_clusters)
     minimum_dif_c = sys.float_info.max  # min dist in different clusters
     maximum_same_c = sys.float_info.min  # max dist in the same cluster
     for i in range(rows - 1):
         for j in range(i + 1, rows):
             dist = utils.euclidian_dist(X[i], X[j])
             self.dists[i][j] = dist
             self.dists[j][i] = dist
             if labels[i] != labels[j]:
                 self.delta[labels[i]][labels[j]] += dist
             else:
                 self.dist_same_c.append([i, j])
                 maximum_same_c = max(dist, maximum_same_c)
     for i in range(n_clusters - 1):
         for j in range(i + 1, n_clusters):
             pair_count = float(self.cluster_sizes[i] *
                                self.cluster_sizes[j])
             # Part of the pair sum may sit below the diagonal (whenever the
             # lower-indexed point carries the larger label). Fold it in so
             # the result no longer depends on the row order of X.
             mirrored_part = self.delta[j][i] / pair_count
             self.delta[i][j] /= pair_count
             mean_dist = self.delta[i][j] + mirrored_part
             if mean_dist != 0:
                 minimum_dif_c = min(minimum_dif_c, mean_dist)
     return -minimum_dif_c / maximum_same_c
    def update(self, X, n_clusters, labels, k, l, id):
        """Refresh the cached state after a change to clusters ``k`` and ``l``.

        Sigmas and the density term are only recomputed when the centroid of
        ``k`` or ``l`` moved farther than ``diameter / (10 * len(X))``.
        Otherwise the cached ``self.dens`` is reused as is.

        NOTE(review): assumes ``self.dens`` is cached already divided by
        ``n_clusters * (n_clusters - 1)``, which is how this method stores
        it after a recomputation; confirm that ``find`` does the same.

        Args:
            X: indexable collection of points.
            n_clusters: number of clusters.
            labels: cluster label of each point.
            k: first affected cluster index.
            l: second affected cluster index.
            id: index into ``X`` of the point handed to ``update_centroids``.

        Returns:
            ``sum(sigmas) / (n_clusters * normed_sigma_x) + dens``.
        """
        point = X[id]
        prev_centroids = np.copy(self.centroids)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

        delta = 10**(-math.log(len(X), 10) - 1)
        threshold = delta * self.diameter
        k_moved = utils.euclidian_dist(prev_centroids[k],
                                       self.centroids[k]) > threshold
        l_moved = utils.euclidian_dist(prev_centroids[l],
                                       self.centroids[l]) > threshold

        if k_moved:
            self.sigmas[k] = self.normed_cluster_sigma(X, labels, k)
        if l_moved:
            self.sigmas[l] = self.normed_cluster_sigma(X, labels, l)
        term1 = sum(self.sigmas) / (n_clusters * self.normed_sigma_x)
        stdev_val = self.stdev(n_clusters)

        if k_moved or l_moved:
            self.dens = 0.0
            # Distinct loop names: the parameters k and l are not clobbered.
            for a in range(0, n_clusters):
                for b in range(0, n_clusters):
                    self.dens += self.den2(X, labels, self.centroids, a, b, stdev_val) /\
                            max(self.den1(X, labels, self.centroids, a, stdev_val),
                                self.den1(X, labels, self.centroids, b, stdev_val))
            # Normalise only a freshly computed sum. Dividing on every call
            # shrank the cached value each time it was reused.
            self.dens /= n_clusters * (n_clusters - 1)

        return (term1 + self.dens)
Example #7
0
 def find(self, X, labels, n_clusters):
     """Compute the index from scratch and cache the intermediate state.

     The result is ``-(closest / spread)``: ``closest`` is the smallest
     non-zero mean point-to-point distance between two different clusters,
     ``spread`` is the largest value of ``2 * sums[c] / size_c``, where
     ``sums[c]`` adds up the distance of every point labelled ``c`` to
     centroid ``c``.

     Layout of ``self.delta`` (unchanged by this fix): a pair of points
     ``i < j`` with different labels adds its distance to
     ``delta[labels[i]][labels[j]]``, and every entry is then divided by
     ``size_a * size_b``. The mean distance of a cluster pair is therefore
     ``delta[a][b] + delta[b][a]``.

     Args:
         X: 2-D array of points (``X.shape`` is unpacked into two values).
         labels: cluster label of each row of ``X``.
         n_clusters: number of clusters.

     Returns:
         The negated ratio described above.
     """
     self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
     self.dists = [[0. for _ in range(len(labels))] for _ in range(len(labels))]
     self.sums = [0 for _ in range(n_clusters)]
     rows, _ = X.shape
     self.point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)
     self.delta_l = [[0.0] * n_clusters] * n_clusters
     self.delta = np.array(self.delta_l)
     self.centroid_dists = [0 for _ in range(len(labels))]
     minimum_dif_c = sys.float_info.max
     for i in range(len(labels)):
         self.centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])
         self.sums[labels[i]] += self.centroid_dists[i]
     for i in range(rows - 1):
         for j in range(i + 1, rows):
             dist = utils.euclidian_dist(X[i], X[j])
             self.dists[i][j] = dist
             self.dists[j][i] = dist
             if labels[i] != labels[j]:
                 self.delta[labels[i]][labels[j]] += dist
     for i in range(n_clusters):
         for j in range(n_clusters):
             self.delta[i][j] /= float(self.point_in_c[i] * self.point_in_c[j])
         self.sums[i] *= (2.0 / self.point_in_c[i])
     # Each direction only holds part of a pair's sum. Combine both halves
     # so the minimum no longer depends on the row order of X.
     for i in range(n_clusters - 1):
         for j in range(i + 1, n_clusters):
             mean_dist = self.delta[i][j] + self.delta[j][i]
             if mean_dist != 0:
                 minimum_dif_c = min(minimum_dif_c, mean_dist)
     return -(minimum_dif_c / max(self.sums))
    def update(self, X, n_clusters, labels, k, l, id):
        """Refresh the cached state for clusters ``k`` and ``l`` and re-score.

        Recounts the cluster sizes, refreshes the centroids, rebuilds the
        numerators of ``k`` and ``l``, patches the distance tables around
        point ``id`` and recomputes the accumulator entries of ``k`` and
        ``l``.

        Args:
            X: indexable collection of points.
            n_clusters: number of clusters.
            labels: cluster label of each point.
            k: first affected cluster index (presumably the cluster the
                point left; TODO confirm against the caller).
            l: second affected cluster index (presumably the cluster the
                point joined; TODO confirm against the caller).
            id: index into ``X`` of the point handed to ``update_centroids``.

        Returns:
            ``sum(self.accumulator) / len(labels)``.
        """
        point = X[id]
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)
        # Rebuild the two affected numerators from zero: the sum of the
        # distances of each cluster's points to its refreshed centroid.
        self.numerators[k] = 0.0
        self.numerators[l] = 0.0
        for i in range(len(labels)):
            if labels[i] == k or labels[i] == l:
                self.numerators[labels[i]] += utils.euclidian_dist(
                    X[i], self.centroids[labels[i]])
        # Patch the row and column of point `id` in inner_max_dists: a real
        # distance against points labelled k, zero against points labelled l.
        for i in range(len(labels)):
            if labels[i] == k:
                self.inner_max_dists[i][id] = utils.euclidian_dist(X[i], X[id])
                self.inner_max_dists[id][i] = self.inner_max_dists[i][id]
            if labels[i] == l:
                self.inner_max_dists[i][id] = 0
                self.inner_max_dists[id][i] = 0
                # Does not depend on i; repeated for every point labelled l.
                self.outer_min_dists[l][id] = sys.float_info.max
        for c in [k, l]:
            for i in range(len(labels)):
                # Only points outside cluster c are considered.
                if labels[i] == c:
                    continue
                # Largest stored distance from point i to any point of c.
                inner_max_dist = 0
                for j in range(len(self.inner_max_dists[i])):
                    if labels[j] == c:
                        inner_max_dist = max(inner_max_dist,
                                             self.inner_max_dists[i][j])

                # A zero maximum leaves the previous entry untouched.
                if inner_max_dist != 0:
                    self.outer_min_dists[c][i] = inner_max_dist
            outer_min_dist = np.amin(self.outer_min_dists[c])
            self.accumulator[c] = self.numerators[c] / outer_min_dist
        return sum(self.accumulator) / len(labels)
Example #9
0
def sv(X, labels, n_clusters):
    """Compute the negated ratio of centroid separation to cluster spread.

    Numerator: for every cluster, the distance from its centroid to the
    nearest other centroid, summed over all clusters. Denominator: for every
    cluster, the sum of the ``ceil(0.1 * size)`` largest point-to-centroid
    distances, scaled by ``10 / size``, summed over all clusters.

    Args:
        X: indexable collection of points.
        labels: cluster label of each point.
        n_clusters: number of clusters.

    Returns:
        ``-numerator / denominator``.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    numerator = 0.0
    for k in range(n_clusters):
        min_dist = sys.float_info.max
        # Look at every other centroid, not only the higher-indexed ones,
        # and include the last cluster as well.
        for l in range(n_clusters):
            if l != k:
                min_dist = min(min_dist,
                               utils.euclidian_dist(centroids[k], centroids[l]))
        numerator += min_dist

    denominator = 0.0
    for k in range(n_clusters):
        member_dists = [
            utils.euclidian_dist(X[i], centroids[k])
            for i in range(len(labels))
            if labels[i] == k
        ]

        # get sum of 0.1*|Ck| largest elements
        acc = 0.0
        for dist in heapq.nlargest(int(math.ceil(0.1 * cluster_sizes[k])),
                                   member_dists):
            acc += dist
        denominator += acc * 10.0 / cluster_sizes[k]
    return -numerator / denominator
Example #10
0
def cs_index(X, labels, n_clusters):
    """Compute the ratio of within-cluster reach to centroid separation.

    Numerator: for every point, the distance to the farthest point of its
    own cluster divided by the size of that cluster, summed over all points.
    Denominator: for every cluster, the distance from its centroid to the
    nearest other centroid, summed over all clusters.

    Args:
        X: 2-D array of points (``X.shape`` is unpacked into two values).
        labels: cluster label of each row of ``X``.
        n_clusters: number of clusters.

    Returns:
        ``numerator / denominator``.

    Raises:
        AssertionError: if the denominator is zero.
    """
    elements, ignore_columns = X.shape
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    # Distances are non-negative, so 0.0 is the neutral start for a maximum.
    max_dists = [0.0] * elements
    for i in range(elements - 1):
        for j in range(i + 1, elements):
            if labels[i] != labels[j]:
                continue  # points of different clusters do not count
            # One distance updates both end points, so every point sees
            # every other member of its cluster, the last one included.
            dist = utils.euclidian_dist(X[i], X[j])
            if dist > max_dists[i]:
                max_dists[i] = dist
            if dist > max_dists[j]:
                max_dists[j] = dist

    # max_dists[i] is now the distance from point i to the farthest point
    # of its own cluster.
    numerator = 0.0
    for i in range(elements):
        numerator += max_dists[i] / cluster_sizes[labels[i]]

    denominator = 0.0
    for i in range(n_clusters):
        min_centroids_dist = sys.float_info.max
        # Compare with every other centroid. Looking only at higher indices
        # left the last cluster stuck at float max.
        for j in range(n_clusters):
            if j != i:
                min_centroids_dist = min(
                    utils.euclidian_dist(centroids[i], centroids[j]),
                    min_centroids_dist)
        denominator += min_centroids_dist

    assert denominator != 0.0
    return numerator / denominator
Example #11
0
 def update(self, X, n_clusters, labels, k, l, id):
     """Refresh the cached state after a change to clusters ``k`` and ``l``.

     Centroid distances involving ``k`` or ``l`` are always refreshed.
     Point-to-centroid distances are only refreshed for a cluster whose
     centroid moved farther than ``diameter / (10 * len(X))``.

     Args:
         X: indexable collection of points.
         n_clusters: number of clusters.
         labels: cluster label of each point.
         k: first affected cluster index.
         l: second affected cluster index.
         id: index into ``X`` of the point handed to ``update_centroids``.

     Returns:
         ``-(numerator / denominator)``: the numerator sums each cluster's
         distance to its nearest other centroid, the denominator sums the
         scaled top-10% point-to-centroid distances of each cluster.
     """
     point = X[id]
     prev_centroids = np.copy(self.centroids)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
     self.centroids = cluster_centroid.update_centroids(
         np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

     # A moved centroid changes its distance to every other centroid, not
     # only to the higher-indexed ones. Refresh the whole row and column.
     for changed in (k, l):
         for i in range(n_clusters):
             if i != changed:
                 gap = utils.euclidian_dist(self.centroids[i], self.centroids[changed])
                 self.centroid_dists[changed][i] = gap
                 self.centroid_dists[i][changed] = gap

     numerator = 0.0
     for i in range(n_clusters):
         numerator += np.amin(self.centroid_dists[i])

     # Clear the entry of point `id` in the row of cluster k.
     self.dists[k][id] = 0.
     delta = 10**(-math.log(len(X), 10) - 1)
     threshold = delta * self.diameter
     k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
     l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold
     for i in range(len(labels)):
         if (labels[i] == k and k_moved) or (labels[i] == l and l_moved):
             self.dists[labels[i]][i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])

     denominator = 0.0
     for c in range(n_clusters):
         # get sum of 0.1*|Ck| largest elements
         acc = 0.0
         for dist in heapq.nlargest(int(math.ceil(0.1 * self.cluster_sizes[c])), self.dists[c]):
             acc += dist
         denominator += acc * 10.0 / self.cluster_sizes[c]
     return -(numerator / denominator)
Example #12
0
def c_ind(X, labels, n_clusters):
    """Compute ``(s_c - s_min) / (s_max - s_min)`` over pairwise distances.

    ``s_c`` is the sum of the distances between points that share a label.
    ``s_min`` and ``s_max`` are the sums of the ``n_w`` smallest and ``n_w``
    largest pairwise distances, where ``n_w`` is the number of same-cluster
    pairs derived from the cluster sizes.

    Args:
        X: indexable collection of points.
        labels: cluster label of each point.
        n_clusters: number of clusters.

    Returns:
        The ratio described above.
    """
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    n_w = 0
    for k in range(n_clusters):
        n_w += cluster_sizes[k] * (cluster_sizes[k] - 1) / 2

    # One pass over all pairs: collect every distance and add up the
    # within-cluster ones, matching the class-based implementation.
    n_points = len(labels)
    distances = []
    s_c = 0
    for i in range(n_points - 1):
        for j in range(i + 1, n_points):
            dist = utils.euclidian_dist(X[i], X[j])
            distances.append(dist)
            if labels[i] == labels[j]:
                s_c += dist

    pair_count = int(n_w)
    s_min_c = np.sum(heapq.nsmallest(pair_count, distances))
    s_max_c = np.sum(heapq.nlargest(pair_count, distances))
    return (s_c - s_min_c) / (s_max_c - s_min_c)
Example #13
0
 def update(self, X, n_clusters, labels, k, l, id):
     """Refresh the cached state after a change to clusters ``k`` and ``l``.

     Centroid distances involving ``k`` or ``l`` are always refreshed.
     ``dist_ps`` entries are only refreshed for a cluster whose centroid
     moved farther than ``diameter / (10 * len(X))``.

     Args:
         X: indexable collection of points.
         n_clusters: number of clusters.
         labels: cluster label of each point.
         k: first affected cluster index.
         l: second affected cluster index.
         id: index into ``X`` of the point handed to ``update_centroids``.

     Returns:
         ``-(max centroid distance / (sum(dist_ps) * n_clusters))``.
     """
     point = X[id]
     prev_centroids = np.copy(self.centroids)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(
         labels, n_clusters)
     self.centroids = cluster_centroid.update_centroids(
         np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

     # A moved centroid changes its distance to every other centroid, so
     # pairs with a lower-indexed cluster must be refreshed too. Both
     # mirrored entries are written; the maximum below is the same whether
     # the table is kept triangular or symmetric elsewhere.
     for changed in (k, l):
         for i in range(n_clusters):
             if i != changed:
                 gap = utils.euclidian_dist(
                     self.centroids[i], self.centroids[changed])
                 self.dist_centroids[changed][i] = gap
                 self.dist_centroids[i][changed] = gap
     numerator = np.amax(self.dist_centroids)

     delta = 10**(-math.log(len(X), 10) - 1)
     threshold = delta * self.diameter
     k_moved = utils.euclidian_dist(prev_centroids[k],
                                    self.centroids[k]) > threshold
     l_moved = utils.euclidian_dist(prev_centroids[l],
                                    self.centroids[l]) > threshold
     for i in range(len(labels)):
         if (labels[i] == k and k_moved) or (labels[i] == l and l_moved):
             self.dist_ps[i] = utils.d_ps(X, labels, X[i], labels[i],
                                          self.centroids)
     denominator = sum(self.dist_ps)
     return -(numerator / (denominator * n_clusters))
Example #14
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Adjust the cached sums after a change to clusters ``k`` and ``l``.

        ``s_c`` loses the distance from point ``id`` to every point labelled
        ``k`` and gains the distance to every point labelled ``l``. ``n_w``
        is adjusted from the recounted cluster sizes. The cached smallest and
        largest distance lists are rebuilt only when ``n_w`` changed by more
        than ``0.1 * len(labels)``.

        Args:
            X: indexable collection of points.
            n_clusters: number of clusters.
            labels: cluster label of each point.
            k: first affected cluster index.
            l: second affected cluster index.
            id: index into ``X`` of the point handed to ``update_centroids``.

        Returns:
            ``(s_c - s_min) / (s_max - s_min)`` from the cached lists.
        """
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), X[id], k, l)

        for idx, label in enumerate(labels):
            if label == k:
                self.s_c -= utils.euclidian_dist(X[idx], X[id])
            if label == l:
                self.s_c += utils.euclidian_dist(X[idx], X[id])

        previous_pairs = self.n_w
        size_k = self.cluster_sizes[k]
        size_l = self.cluster_sizes[l]
        # Pair counts of k and l before and after the change. The sizes read
        # here are the recounted ones.
        k_pairs_before = (size_k + 1) * size_k / 2
        k_pairs_after = size_k * (size_k - 1) / 2
        l_pairs_before = (size_l - 1) * (size_l - 2) / 2
        l_pairs_after = size_l * (size_l - 1) / 2
        self.n_w = (self.n_w - k_pairs_before + k_pairs_after
                    - l_pairs_before + l_pairs_after)

        tolerance = 0.1
        if abs(self.n_w - previous_pairs) > tolerance * len(labels):
            wanted = int(self.n_w)
            self.s_min = heapq.nsmallest(wanted, self.distances)
            self.s_max = heapq.nlargest(wanted, self.distances)

        low_total = sum(self.s_min)
        high_total = sum(self.s_max)
        return (self.s_c - low_total) / (high_total - low_total)
Example #15
0
    def find(self, X, labels, n_clusters):
        """Compute ``(s_c - s_min) / (s_max - s_min)`` and cache its parts.

        ``s_c`` is the sum of the distances between points sharing a label.
        ``s_min`` and ``s_max`` are the ``n_w`` smallest and ``n_w`` largest
        pairwise distances, where ``n_w`` is the number of same-cluster
        pairs derived from the cluster sizes.

        Args:
            X: 2-D array of points (``X.shape`` is unpacked into two values).
            labels: cluster label of each row of ``X``.
            n_clusters: number of clusters.

        Returns:
            The ratio described above.
        """
        self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
        self.diameter = utils.find_diameter(X)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
        self.distances = []
        self.s_c = 0
        self.n_w = 0
        rows, _ = X.shape

        # Single pass over all pairs: every distance is computed once, kept
        # in the full list and, for same-label pairs, added to s_c.
        for i in range(rows - 1):
            for j in range(i + 1, rows):
                dist = utils.euclidian_dist(X[i], X[j])
                self.distances.append(dist)
                if labels[i] == labels[j]:
                    self.s_c += dist

        for k in range(0, n_clusters):
            self.n_w += self.cluster_sizes[k] * (self.cluster_sizes[k] - 1) / 2

        self.s_min = heapq.nsmallest(int(self.n_w), self.distances)
        self.s_max = heapq.nlargest(int(self.n_w), self.distances)

        s_min_c = sum(self.s_min)
        s_max_c = sum(self.s_max)
        return (self.s_c - s_min_c) / (s_max_c - s_min_c)
Example #16
0
    def find(self, X, labels, n_clusters):
        """Compute the index from scratch and cache the distance tables.

        Numerator: for every cluster, the distance from its centroid to the
        nearest other centroid, summed over all clusters. Denominator: for
        every cluster, the sum of the ``ceil(0.1 * size)`` largest
        point-to-centroid distances scaled by ``10 / size``, summed over all
        clusters.

        Args:
            X: indexable collection of points.
            labels: cluster label of each point.
            n_clusters: number of clusters.

        Returns:
            ``-(numerator / denominator)``.
        """
        n_points = len(labels)
        self.diameter = utils.find_diameter(X)
        self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

        # Symmetric centroid table; the diagonal keeps the float-max filler
        # so it never wins the row minimum.
        self.centroid_dists = [[sys.float_info.max for _ in range(n_clusters)]
                               for _ in range(n_clusters)]
        for a in range(n_clusters - 1):
            for b in range(a + 1, n_clusters):
                gap = utils.euclidian_dist(self.centroids[a], self.centroids[b])
                self.centroid_dists[a][b] = gap
                self.centroid_dists[b][a] = gap

        separation = 0.0
        for row in self.centroid_dists:
            separation += np.amin(row)

        # Row c holds the distance to centroid c for points labelled c and
        # zero for every other point.
        self.dists = [[0 for _ in range(n_points)] for _ in range(n_clusters)]
        for c in range(n_clusters):
            for p in range(n_points):
                if labels[p] == c:
                    self.dists[c][p] = utils.euclidian_dist(X[p], self.centroids[c])

        spread = 0.0
        for c in range(n_clusters):
            # Sum of the 0.1 * |Ck| largest distances of the cluster.
            top_count = int(math.ceil(0.1 * self.cluster_sizes[c]))
            top_total = 0.0
            for value in heapq.nlargest(top_count, self.dists[c]):
                top_total += value
            spread += top_total * 10.0 / self.cluster_sizes[c]
        return -(separation / spread)
Example #17
0
def os(X, labels, n_clusters):
    """Compute the negated ratio of summed ``ov`` values to cluster spread.

    Numerator: ``ov(X, labels, point, cluster)`` summed over every point,
    visited cluster by cluster. Denominator: for every cluster, the sum of
    the ``ceil(0.1 * size)`` largest point-to-centroid distances scaled by
    ``10 / size``, summed over all clusters.

    Args:
        X: indexable collection of points.
        labels: cluster label of each point.
        n_clusters: number of clusters.

    Returns:
        ``-numerator / denominator``.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    n_points = len(labels)

    overlap_total = 0.0
    for cluster in range(n_clusters):
        for idx in range(n_points):
            if labels[idx] == cluster:
                overlap_total += ov(X, labels, X[idx], cluster)

    spread_total = 0.0
    for cluster in range(n_clusters):
        member_dists = [
            utils.euclidian_dist(X[idx], centroids[cluster])
            for idx in range(n_points)
            if labels[idx] == cluster
        ]

        # Sum of the 0.1 * |Ck| largest distances of the cluster.
        top_count = int(math.ceil(0.1 * cluster_sizes[cluster]))
        top_total = 0.0
        for value in heapq.nlargest(top_count, member_dists):
            top_total += value

        spread_total += top_total * 10.0 / cluster_sizes[cluster]

    return -overlap_total / spread_total
 def find(self, X, labels, n_clusters):
     """Compute the index from scratch and cache its per-cluster parts.

     For every ordered pair of distinct clusters whose centroids do not
     coincide, ``sums[i][j]`` is ``(s_i + s_j) / centroid distance``, with
     ``s_c`` coming from ``self.s``. The result is the mean over clusters
     of the row maximum of ``sums``.

     Args:
         X: indexable collection of points.
         labels: cluster label of each point.
         n_clusters: number of clusters.

     Returns:
         The mean of the row maxima.
     """
     self.diameter = utils.find_diameter(X)
     self.s_clusters = [0. for _ in range(n_clusters)]
     self.centroids = cluster_centroid.cluster_centroid(
         X, labels, n_clusters)
     self.points_in_clusters = cluster_centroid.count_cluster_sizes(
         labels, n_clusters)
     for c in range(n_clusters):
         self.s_clusters[c] = self.s(X, c, self.points_in_clusters, labels,
                                     self.centroids)

     self.sums = [[0 for _ in range(n_clusters)] for _ in range(n_clusters)]
     total = 0
     for a in range(n_clusters):
         for b in range(n_clusters):
             if a == b:
                 continue
             gap = utils.euclidian_dist(self.centroids[a],
                                        self.centroids[b])
             # Coinciding centroids leave the entry at its initial zero.
             if gap != 0:
                 self.sums[a][b] = (self.s_clusters[a] +
                                    self.s_clusters[b]) / gap
         total += np.amax(self.sums[a])
     total /= float(n_clusters)
     return total
Example #19
0
def sf(X, labels, n_clusters):
    """Compute ``-(1 - 1 / exp(exp(-bcd - wcd)))`` for a labelling.

    ``bcd`` and ``wcd`` come from ``bcd_score`` and ``wcd_score``, called
    with the centroids and cluster sizes of the labelling.

    Args:
        X: indexable collection of points.
        labels: cluster label of each point.
        n_clusters: number of clusters.

    Returns:
        The negated score described above.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    between = bcd_score(X, labels, n_clusters, centroids, cluster_sizes)
    within = wcd_score(X, labels, n_clusters, centroids, cluster_sizes)
    # NOTE(review): the original author marked this exponent as uncertain;
    # confirm the sign of the bcd term against the index definition.
    inner = math.exp(-between - within)

    score = 1.0 - 1.0 / math.exp(inner)
    return -score
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-score after a change to clusters ``k`` and ``l``.

        Point-to-centroid distances are only refreshed for a cluster whose
        centroid moved farther than ``diameter / (10 * len(X))``. The
        per-cluster sums of ``k`` and ``l`` are rebuilt from those
        distances; the other sums are taken from ``self.sums``.

        NOTE(review): the refreshed sums and distances are local to this
        call; ``self.sums`` and ``self.centroid_dists`` are not written
        back. Confirm this is intended.

        Args:
            X: indexable collection of points.
            n_clusters: number of clusters.
            labels: cluster label of each point.
            k: first affected cluster index.
            l: second affected cluster index.
            id: index into ``X`` of the point handed to ``update_centroids``.

        Returns:
            ``-(min delta / max spread)`` with
            ``delta[i][j] = (sums_i + sums_j) / (size_i + size_j)`` and
            ``spread_c = 2 * sums_c / size_c``. Empty clusters contribute
            infinity to both.
        """
        point = X[id]
        prev_centroids = np.copy(self.centroids)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(np.copy(labels), n_clusters)
        self.centroids = cluster_centroid.update_centroids(self.centroids, self.cluster_sizes, point, k, l)
        minimum_dif_c = sys.float_info.max  # min dist in different clusters

        # The shift tests do not depend on the point: evaluate them once.
        dell = 10**(-math.log(len(X), 10) - 1)
        threshold = dell * self.diameter
        k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
        l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold

        new_centroid_dists = list(self.centroid_dists)
        for i in range(len(labels)):
            if (labels[i] == k and k_moved) or (labels[i] == l and l_moved):
                new_centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])

        new_sums = [0 for _ in range(n_clusters)]
        for i in range(n_clusters):
            if i != k and i != l:
                new_sums[i] = self.sums[i]
        for i in range(len(labels)):
            if labels[i] == k or labels[i] == l:
                new_sums[labels[i]] += new_centroid_dists[i]

        # update numerator: every off-diagonal delta is rebuilt from the
        # sums, so no rescaling of the previous values is needed.
        for i in range(n_clusters):
            for j in range(n_clusters):
                if i != j:
                    if self.cluster_sizes[i] + self.cluster_sizes[j] == 0:
                        self.delta[i][j] = float('inf')
                    else:
                        self.delta[i][j] = (new_sums[i] + new_sums[j]) / float(self.cluster_sizes[i] + self.cluster_sizes[j])
                    minimum_dif_c = min(minimum_dif_c, self.delta[i][j])

        # update denominator
        denominator = list(new_sums)
        for i in range(n_clusters):
            if self.cluster_sizes[i] == 0:
                denominator[i] = float('inf')
            else:
                denominator[i] *= (2 / self.cluster_sizes[i])

        return -(minimum_dif_c / max(denominator))
Example #21
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-score after a change to clusters ``k`` and ``l``.

        ``self.delta`` is scaled back to raw distance sums with the previous
        cluster sizes, corrected for the pairs involving point ``id`` and
        divided by the recounted sizes again.

        Layout of ``self.delta``: a pair of points ``i < j`` with different
        labels is accounted in ``delta[label_i][label_j]``, so the mean
        distance of a cluster pair is ``delta[a][b] + delta[b][a]``.

        NOTE(review): the refreshed sums and distances used for the
        denominator are local to this call; ``self.sums`` and
        ``self.centroid_dists`` are not written back. Confirm this is
        intended.

        Args:
            X: indexable collection of points.
            n_clusters: number of clusters.
            labels: cluster label of each point.
            k: first affected cluster index.
            l: second affected cluster index.
            id: index into ``X`` of the point handed to ``update_centroids``.

        Returns:
            ``-(closest / spread)``: ``closest`` is the smallest non-zero
            mean distance between two clusters, ``spread`` is the largest
            ``2 * sums_c / size_c``.
        """
        self.diameter = utils.find_diameter(X)
        prev_point_in_c = list(self.point_in_c)
        prev_centroids = np.copy(self.centroids)
        self.point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.point_in_c), X[id], k, l)
        minimum_dif_c = sys.float_info.max  # min dist in different clusters

        # update numerator: back to raw sums first
        for i in range(n_clusters):
            for j in range(n_clusters):
                self.delta[i][j] *= (prev_point_in_c[i] * prev_point_in_c[j])

        # Remove the pairs of point `id` from the entries of k and add them
        # to the entries of l, keeping the lower-index-first layout.
        for i in range(len(labels)):
            if labels[i] != k and id < i:
                self.delta[k][labels[i]] -= self.dists[id][i]
            if labels[i] != k and id > i:
                self.delta[labels[i]][k] -= self.dists[i][id]
            if labels[i] != l and id < i:
                self.delta[l][labels[i]] += self.dists[id][i]
            if labels[i] != l and id > i:
                self.delta[labels[i]][l] += self.dists[i][id]

        # Every entry was scaled up above, so every entry is scaled down
        # again. Dividing only the upper triangle left the lower one growing
        # with each call.
        for i in range(n_clusters):
            for j in range(n_clusters):
                pair_count = float(self.point_in_c[i] * self.point_in_c[j])
                if pair_count != 0:
                    self.delta[i][j] /= pair_count
                else:
                    # No pairs exist between these clusters.
                    self.delta[i][j] = 0.0

        # Both directions together give the mean distance of a cluster pair.
        for i in range(n_clusters - 1):
            for j in range(i + 1, n_clusters):
                mean_dist = self.delta[i][j] + self.delta[j][i]
                if mean_dist != 0:
                    minimum_dif_c = min(minimum_dif_c, mean_dist)

        # update denominator
        dell = 10 ** (-math.log(len(X), 10) - 1)
        threshold = dell * self.diameter
        k_moved = utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > threshold
        l_moved = utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > threshold
        new_centroid_dists = list(self.centroid_dists)
        for i in range(len(labels)):
            if (labels[i] == k and k_moved) or (labels[i] == l and l_moved):
                new_centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])
        new_sums = [0 for _ in range(n_clusters)]
        for i in range(n_clusters):
            if i != k and i != l:
                new_sums[i] = self.sums[i]
        for i in range(len(labels)):
            if labels[i] == k or labels[i] == l:
                new_sums[labels[i]] += new_centroid_dists[i]
        denominator = list(new_sums)
        for i in range(n_clusters):
            if self.point_in_c[i] != 0:
                denominator[i] *= (2 / self.point_in_c[i])
        return -(minimum_dif_c / max(denominator))
Example #22
0
def sym_db(X, labels, n_clusters):
    """Compute a Davies-Bouldin style index built on the ``sym_s`` scatter.

    For every cluster ``k`` the largest ratio
    ``(sym_s(k) + sym_s(l)) / euclidian_dist(centroid_k, centroid_l)`` over
    all other clusters ``l`` is taken; the per-cluster maxima are then
    averaged over ``n_clusters``.

    Args:
        X: the data set; passed through to the centroid and scatter helpers.
        labels: cluster label of every element of ``X``.
        n_clusters: number of clusters.

    Returns:
        The averaged per-cluster worst ratio.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    # The scatter of a cluster does not depend on which partner it is paired
    # with, so evaluate it once per cluster rather than once per pair.
    scatters = [
        sym_s(X, labels, c, cluster_sizes, centroids)
        for c in range(n_clusters)
    ]

    db = 0
    for k in range(n_clusters):
        # Restart the maximum for every cluster.  Carrying it over from the
        # previous cluster would make each term the running maximum over all
        # earlier clusters and the result dependent on label order.
        max_fraction = sys.float_info.min
        for l in range(n_clusters):
            if k != l:
                fraction = ((scatters[k] + scatters[l]) /
                            utils.euclidian_dist(centroids[k], centroids[l]))
                max_fraction = max(max_fraction, fraction)
        db += max_fraction
    return db / float(n_clusters)
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after a change involving point ``id``.

        Cluster sizes are recounted from ``labels`` and the centroids are
        refreshed through ``cluster_centroid.update_centroids`` (presumably
        point ``id`` moved from cluster ``k`` to cluster ``l`` -- TODO
        confirm).  The refreshed point-to-centroid distances and per-cluster
        sums are kept in locals only; ``self.centroid_dists`` and
        ``self.sums`` are not written here.

        Returns:
            ``-(min(self.centers) / max(denominator))`` where ``denominator``
            holds, per cluster, the summed centroid distances scaled by
            ``2 / cluster_size``.
        """
        moved_point = X[id]
        old_centroids = np.copy(self.centroids)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            np.copy(labels), n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes),
            moved_point, k, l)

        # --- denominator -------------------------------------------------
        # A cluster's member distances are only recomputed when its centroid
        # moved by more than this fraction of the data set diameter.
        tolerance = 10**(-math.log(len(X), 10) - 1)
        refreshed_dists = list(self.centroid_dists)
        for idx, label in enumerate(labels):
            if label != k and label != l:
                continue
            centroid_shift = utils.euclidian_dist(
                old_centroids[label], self.centroids[label])
            if centroid_shift > tolerance * self.diameter:
                refreshed_dists[idx] = utils.euclidian_dist(
                    X[idx], self.centroids[label])

        # Untouched clusters keep their stored sum; k and l are rebuilt.
        cluster_sums = [
            0 if (c == k or c == l) else self.sums[c]
            for c in range(n_clusters)
        ]
        for idx, label in enumerate(labels):
            if label == k or label == l:
                cluster_sums[label] += refreshed_dists[idx]

        denominator = list(cluster_sums)
        for c in range(n_clusters):
            size = self.cluster_sizes[c]
            if size != 0:
                denominator[c] = denominator[c] * (2 / size)

        # --- numerator ---------------------------------------------------
        # Only the rows/columns of the two affected clusters can change.
        for c in range(n_clusters):
            for touched in (k, l):
                if c != touched:
                    gap = utils.euclidian_dist(
                        self.centroids[c], self.centroids[touched])
                    self.centers[c][touched] = gap
                    self.centers[touched][c] = gap

        closest_centroids = np.amin(self.centers)
        return -(closest_centroids / max(denominator))
Example #24
0
    def find(self, X, labels, n_clusters):
        """Compute the index from scratch and cache the intermediate state.

        Stores centroids, cluster sizes, the per-cluster scatter
        (``self.s_clusters``), the data set diameter and two
        ``n_clusters x n_clusters`` tables: ``self.max_s_sum`` (scatter sums)
        and ``self.min_centroids_dist`` (centroid distances).

        Only cells with ``l > k`` are filled; every other cell, including the
        whole last row, keeps its ``sys.float_info`` sentinel.

        Returns:
            The sum over ``k`` of ``max(max_s_sum[k]) /
            min(min_centroids_dist[k])`` divided by ``n_clusters``.
        """
        self.centroids = cluster_centroid.cluster_centroid(
            X, labels, n_clusters)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.max_s_sum = [[sys.float_info.min] * n_clusters
                          for _ in range(n_clusters)]
        self.min_centroids_dist = [[sys.float_info.max] * n_clusters
                                   for _ in range(n_clusters)]
        self.s_clusters = [0] * n_clusters
        self.diameter = utils.find_diameter(X)

        for cluster in range(n_clusters):
            self.s_clusters[cluster] = self.s(
                X, cluster, self.cluster_sizes, labels, self.centroids)

        total = 0.0
        for k in range(n_clusters):
            scatter_row = self.max_s_sum[k]
            distance_row = self.min_centroids_dist[k]
            for l in range(k + 1, n_clusters):
                scatter_row[l] = self.s_clusters[k] + self.s_clusters[l]
                distance_row[l] = utils.euclidian_dist(
                    self.centroids[k], self.centroids[l])
            total += np.max(scatter_row) / np.min(distance_row)
        return total / n_clusters
Example #25
0
def db_star_index(X, labels, n_clusters):
    """Compute the DB* variant of the Davies-Bouldin index.

    For every cluster ``k`` the largest scatter sum ``s(k) + s(l)`` and the
    smallest centroid distance to any other cluster ``l`` are determined
    independently; their ratio is accumulated and averaged over
    ``n_clusters``.

    Args:
        X: the data set; passed through to the centroid and scatter helpers.
        labels: cluster label of every element of ``X``.
        n_clusters: number of clusters.

    Returns:
        The averaged per-cluster ratio.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    # Scatter depends on the cluster only, so compute it once per cluster
    # instead of twice per pair.
    scatters = [
        s(X, c, cluster_sizes, labels, centroids) for c in range(n_clusters)
    ]

    numerator = 0.0
    for k in range(n_clusters):
        max_s_sum = sys.float_info.min
        min_centroids_dist = sys.float_info.max
        # Compare k against every other cluster.  Visiting only l > k left
        # the last cluster with no partner at all and made the value depend
        # on how the clusters happen to be numbered.
        for l in range(n_clusters):
            if l == k:
                continue
            max_s_sum = max(max_s_sum, scatters[k] + scatters[l])
            min_centroids_dist = min(
                min_centroids_dist,
                utils.euclidian_dist(centroids[k], centroids[l]))
        numerator += max_s_sum / min_centroids_dist
    return numerator / n_clusters
    def update(self, X, n_clusters, labels, k, l, id):
        """Recompute the index after point ``id`` changes cluster.

        The code treats point ``id`` as leaving cluster ``k`` and joining
        cluster ``l``: pairs of ``id`` with members of ``k`` are excluded
        from the same-cluster maximum, pairs with members of ``l`` are added
        to ``self.dist_same_c``.

        Returns:
            ``-minimum_dif_c / maximum_same_c`` where ``minimum_dif_c`` is the
            smallest non-zero ``self.delta[i][j]`` with ``i < j`` and
            ``maximum_same_c`` the largest stored distance among the
            same-cluster pairs still considered valid.
        """
        prev_cluster_sizes = list(self.cluster_sizes)
        # NOTE(review): centroids are updated with the sizes from *before*
        # the recount below -- confirm this is what update_centroids expects.
        self.centroids = cluster_centroid.update_centroids(
            list(self.centroids), list(self.cluster_sizes), X[id], k, l)

        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)

        minimum_dif_c = sys.float_info.max  # min dist in different clusters
        maximum_same_c = sys.float_info.min  # max dist in the same cluster
        # Pairs (in both orientations) that stopped being same-cluster pairs.
        delete_from_same = []
        for i in range(0, len(labels)):
            if labels[i] == k:
                delete_from_same.append([i, id])
                delete_from_same.append([id, i])
            if labels[i] == l and i != id:

                self.dist_same_c.append([i, id])
                self.dist_same_c.append([id, i])

        # NOTE(review): stale pairs are only skipped here; they are never
        # removed from self.dist_same_c, so the list grows with every call.
        for pair in self.dist_same_c:
            cur = self.dists[pair[0]][pair[1]]
            if cur > maximum_same_c:
                # Linear scan of a list; only reached for candidate maxima.
                if pair not in delete_from_same:
                    maximum_same_c = cur
        # Scale the upper-triangle entries by the previous size products
        # (presumably turning stored averages back into sums -- TODO confirm
        # against the matching find()).
        for i in range(n_clusters - 1):
            for j in range(i + 1, n_clusters):
                self.delta[i][j] *= (prev_cluster_sizes[i] *
                                     prev_cluster_sizes[j])
        # Remove the distances from ``id`` that were counted for cluster k
        # and add them for cluster l.
        # NOTE(review): the cell that is adjusted is chosen by comparing point
        # indices (id vs i), while the scaling loops only touch cells with
        # row < column -- confirm the two conventions agree.
        for i in range(len(labels)):
            if labels[i] != k and id < i:
                self.delta[k][labels[i]] -= self.dists[id][i]
            if labels[i] != k and id > i:
                self.delta[labels[i]][k] -= self.dists[i][id]
            if labels[i] != l and id < i:
                self.delta[l][labels[i]] += self.dists[id][i]
            if labels[i] != l and id > i:
                self.delta[labels[i]][l] += self.dists[i][id]
        # Divide by the new size products and pick the smallest non-zero
        # entry.  NOTE(review): an empty cluster makes the divisor zero.
        for i in range(n_clusters - 1):
            for j in range(i + 1, n_clusters):
                self.delta[i][j] /= float(self.cluster_sizes[i] *
                                          self.cluster_sizes[j])
                if self.delta[i][j] != 0:
                    minimum_dif_c = min(minimum_dif_c, self.delta[i][j])
        return -minimum_dif_c / maximum_same_c
Example #27
0
def davies_bouldin(X, n_clusters, labels):
    """Compute the Davies-Bouldin index of a clustering.

    For every cluster ``i`` the largest ratio
    ``(s(i) + s(j)) / euclidian_dist(centroid_i, centroid_j)`` over all other
    clusters ``j`` is taken; the per-cluster maxima are averaged over
    ``n_clusters``.  Pairs whose centroids coincide (distance 0) are skipped.

    Args:
        X: the data set; passed through to the centroid and scatter helpers.
        n_clusters: number of clusters.
        labels: cluster label of every element of ``X``.

    Returns:
        The averaged per-cluster worst ratio.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    # Scatter depends on the cluster only; compute it once per cluster.
    scatters = [
        s(X, c, point_in_c, labels, centroids) for c in range(n_clusters)
    ]

    db = 0
    for i in range(n_clusters):
        # Restart the maximum for each cluster so that a large ratio of an
        # earlier cluster is not counted again for all later ones.
        worst_ratio = sys.float_info.min
        for j in range(n_clusters):
            if i == j:
                continue
            separation = utils.euclidian_dist(centroids[i], centroids[j])
            if separation == 0:
                # Coincident centroids: no ratio can be formed.  Skip the
                # pair instead of reusing the ratio of the previous pair
                # (or hitting an unbound variable on the very first pair).
                continue
            worst_ratio = max(worst_ratio,
                              (scatters[i] + scatters[j]) / separation)
        db += worst_ratio
    return db / float(n_clusters)
Example #28
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Incrementally refresh the per-point ``a``/``b`` values and the index.

        Point ``id`` gets fresh values from ``self.a`` / ``self.b`` evaluated
        for cluster ``l``; every other point has its cached averages adjusted
        in place using the stored distance to ``id`` (presumably ``id`` moved
        from cluster ``k`` to cluster ``l`` and ``labels`` already reflects
        the move -- TODO confirm).

        Returns:
            The negated mean over all points of
            ``(b_ss[i] - a_ss[i]) / max(b_ss[i], a_ss[i])``.
        """
        # Sizes are recounted first, so below ``cluster_sizes[k] + 1`` and
        # ``cluster_sizes[l] - 1`` stand for the sizes before this call
        # (assuming labels is already updated -- see docstring).
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), X[id], k, l)
        self.a_ss[id] = self.a(X, labels, id, l)
        self.b_ss[id] = self.b(X, n_clusters, labels, id, l)
        for i in range(len(labels)):
            if i == id:
                continue
            if labels[i] == k:
                # Cluster k lost ``id``: rescale by the old size, drop the
                # distance to ``id``, then divide by the new size.
                self.a_ss[i] *= (self.cluster_sizes[k] + 1)
                self.a_ss[i] -= self.dists_e[i][id]
                if self.cluster_sizes[k] == 0:
                    self.a_ss[i] = float('inf')
                else:
                    self.a_ss[i] /= self.cluster_sizes[k]
            if labels[i] == l:
                # Cluster l gained ``id``: rescale by the old size, add the
                # distance to ``id``, then divide by the new size.
                self.a_ss[i] *= (self.cluster_sizes[l] - 1)
                self.a_ss[i] += self.dists_e[i][id]
                if self.cluster_sizes[l] == 0:
                    self.a_ss[i] = float('inf')
                else:
                    self.a_ss[i] /= self.cluster_sizes[l]
            # The same adjustment is applied to the cached per-cluster
            # values of point i for clusters l and k, regardless of which
            # cluster point i itself belongs to.
            self.dists_for_b[i][l] *= (self.cluster_sizes[l] - 1)
            self.dists_for_b[i][l] += self.dists_e[i][id]
            if self.cluster_sizes[l] == 0:
                self.dists_for_b[i][l] = float('inf')
            else:
                self.dists_for_b[i][l] /= self.cluster_sizes[l]
            self.dists_for_b[i][k] *= (self.cluster_sizes[k] + 1)
            self.dists_for_b[i][k] -= self.dists_e[i][id]
            if self.cluster_sizes[k] == 0:
                self.dists_for_b[i][k] = float('inf')
            else:
                self.dists_for_b[i][k] /= self.cluster_sizes[k]
            # NOTE(review): the minimum runs over every cluster column,
            # including the one of point i's own cluster -- confirm that
            # column is stored in a way that keeps it from winning.
            self.b_ss[i] = min(self.dists_for_b[i])

        ch = 0
        for i in range(len(labels)):
            # NOTE(review): divides by zero when both values are 0.
            ch += (self.b_ss[i] - self.a_ss[i]) / max(self.b_ss[i],
                                                      self.a_ss[i])
        return -(ch / float(len(labels)))
Example #29
0
    def find(self, X, labels, n_clusters):
        """Compute the index from scratch and cache the intermediate state.

        Cached on ``self``: the diameter, centroids, cluster sizes,
        ``numerators`` (per-cluster sum of point-to-centroid distances),
        ``inner_max_dists`` (point-to-point distances for pairs with
        different labels, zero elsewhere), ``outer_min_dists`` and
        ``accumulator``.

        For each cluster ``c`` and each point outside it, the largest
        distance to a member of ``c`` is recorded; the smallest of those
        values divides the cluster's numerator.

        Returns:
            The sum of the per-cluster ratios divided by ``len(labels)``.
        """
        n_points = len(labels)
        self.diameter = utils.find_diameter(X)
        self.centroids = cluster_centroid.cluster_centroid(
            X, labels, n_clusters)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)

        self.numerators = [0.0] * n_clusters
        for idx, label in enumerate(labels):
            self.numerators[label] += utils.euclidian_dist(
                X[idx], self.centroids[label])

        self.inner_max_dists = [[0] * n_points for _ in range(n_points)]
        self.outer_min_dists = [[sys.float_info.max] * n_points
                                for _ in range(n_clusters)]
        self.accumulator = [0] * n_clusters

        # Distances between every outside point and every cluster member.
        for cluster in range(n_clusters):
            members = [j for j in range(n_points) if labels[j] == cluster]
            for outside in range(n_points):
                if labels[outside] == cluster:
                    continue
                for member in members:
                    gap = utils.euclidian_dist(X[outside], X[member])
                    self.inner_max_dists[outside][member] = gap
                    self.inner_max_dists[member][outside] = gap

        for cluster in range(n_clusters):
            members = [j for j in range(n_points) if labels[j] == cluster]
            outer_row = self.outer_min_dists[cluster]
            for outside in range(n_points):
                if labels[outside] == cluster:
                    continue
                dist_row = self.inner_max_dists[outside]
                # Leading 0 reproduces the "no member / all zero" case.
                farthest = max([0] + [dist_row[j] for j in members])
                if farthest != 0:
                    outer_row[outside] = farthest
            nearest_outer = np.amin(outer_row)
            self.accumulator[cluster] = self.numerators[cluster] / nearest_outer
        return sum(self.accumulator) / n_points
Example #30
0
    def find(self, X, labels, n_clusters):
        """Compute the index from scratch and cache the intermediate state.

        Cached on ``self``: the diameter, centroids, cluster sizes,
        ``dist_centroids`` (centroid distances, upper triangle only, zeros
        elsewhere) and ``dist_ps`` (one ``utils.d_ps`` value per point).

        Returns:
            ``-(max centroid distance / (sum(dist_ps) * n_clusters))``.
        """
        self.diameter = utils.find_diameter(X)
        self.dist_centroids = [[0] * n_clusters for _ in range(n_clusters)]
        self.dist_ps = [0] * len(labels)
        self.centroids = cluster_centroid.cluster_centroid(
            X, labels, n_clusters)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)

        for first in range(n_clusters - 1):
            row = self.dist_centroids[first]
            for second in range(first + 1, n_clusters):
                row[second] = utils.euclidian_dist(
                    self.centroids[first], self.centroids[second])
        widest_gap = np.amax(self.dist_centroids)

        for idx, label in enumerate(labels):
            self.dist_ps[idx] = utils.d_ps(
                X, labels, X[idx], label, self.centroids)
        total_point_dist = sum(self.dist_ps)

        return -(widest_gap / (total_point_dist * n_clusters))