Esempio n. 1
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after the point at index ``id`` changed cluster.

        ``k`` and ``l`` are the two clusters whose cached statistics are
        rebuilt (presumably the point's old and new cluster -- confirm against
        the caller). ``labels`` is expected to hold the new assignment.

        Returns the sum of the per-cluster accumulators divided by the number
        of points.
        """
        moved = X[id]
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), moved, k, l)

        # Rebuild the point-to-centroid distance sums of the two clusters.
        self.numerators[k] = 0.0
        self.numerators[l] = 0.0
        for idx, label in enumerate(labels):
            if label == k or label == l:
                self.numerators[label] += utils.euclidian_dist(
                    X[idx], self.centroids[label])

        # Patch the row/column of the moved point in the pairwise cache.
        # The two tests are deliberately independent (not ``elif``).
        for idx, label in enumerate(labels):
            if label == k:
                gap = utils.euclidian_dist(X[idx], X[id])
                self.inner_max_dists[idx][id] = gap
                self.inner_max_dists[id][idx] = gap
            if label == l:
                self.inner_max_dists[idx][id] = 0
                self.inner_max_dists[id][idx] = 0
                self.outer_min_dists[l][id] = sys.float_info.max

        n_points = len(labels)
        for cluster in (k, l):
            for idx in range(n_points):
                if labels[idx] == cluster:
                    continue
                row = self.inner_max_dists[idx]
                # Largest cached distance from this outside point to any
                # member of ``cluster``; 0 means "nothing cached".
                farthest = max([0] + [row[other]
                                      for other in range(len(row))
                                      if labels[other] == cluster])
                if farthest != 0:
                    self.outer_min_dists[cluster][idx] = farthest
            closest_outside = np.amin(self.outer_min_dists[cluster])
            self.accumulator[cluster] = (
                self.numerators[cluster] / closest_outside)
        return sum(self.accumulator) / len(labels)
Esempio n. 2
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Incrementally refresh the cached C-index style score.

        The pair-count arithmetic below treats cluster ``k`` as having lost
        one member and cluster ``l`` as having gained one, i.e. the point at
        index ``id`` moved from ``k`` to ``l`` and ``labels`` already
        reflects that.

        Returns ``(s_c - sum(s_min)) / (sum(s_max) - sum(s_min))``.
        """
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes),
            X[id], k, l)

        # Within-cluster distance sum: drop the moved point's distances to
        # cluster k, add its distances to cluster l.
        moved = X[id]
        for idx, label in enumerate(labels):
            if label == k:
                self.s_c -= utils.euclidian_dist(X[idx], moved)
            if label == l:
                self.s_c += utils.euclidian_dist(X[idx], moved)

        # Number of within-cluster pairs: replace the old pair counts of k
        # and l by the new ones (sizes are already the post-move sizes).
        size_k = self.cluster_sizes[k]
        size_l = self.cluster_sizes[l]
        old_n_w = self.n_w
        self.n_w = (old_n_w
                    - (size_k + 1) * size_k / 2
                    + size_k * (size_k - 1) / 2
                    - (size_l - 1) * (size_l - 2) / 2
                    + size_l * (size_l - 1) / 2)

        # The extreme-distance selections are only rebuilt when the pair
        # count changed by more than 10% of the number of points.
        refresh_fraction = 0.1
        if abs(self.n_w - old_n_w) > refresh_fraction * len(labels):
            n_pairs = int(self.n_w)
            self.s_min = heapq.nsmallest(n_pairs, self.distances)
            self.s_max = heapq.nlargest(n_pairs, self.distances)

        smallest_total = sum(self.s_min)
        largest_total = sum(self.s_max)
        return (self.s_c - smallest_total) / (largest_total - smallest_total)
Esempio n. 3
0
 def find(self, X, labels, n_clusters):
     """Compute the index from scratch and cache the intermediate state.

     Cached on ``self``: data diameter, centroids, cluster sizes, the full
     pairwise distance matrix (``dists``), each point's distance to its own
     centroid (``centroid_dists``), per-cluster sums of those distances
     (``sums``), the between-cluster term ``delta`` and the list of
     same-cluster index pairs (``dists_same_c``).

     Returns ``-(min between-cluster delta / max same-cluster distance)``.
     """
     self.diameter = utils.find_diameter(X)
     self.centroids = cluster_centroid.cluster_centroid(
         X, labels, n_clusters)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(
         labels, n_clusters)
     n_points = len(labels)
     self.dists = [[0 for _ in range(n_points)] for _ in range(n_points)]
     self.centroid_dists = [0 for _ in range(n_points)]
     self.delta = [[0 for _ in range(n_clusters)]
                   for _ in range(n_clusters)]
     # A full recomputation starts from an empty pair list; the original
     # appended to an attribute this method never created.
     self.dists_same_c = []
     minimum_dif_c = sys.float_info.max  # min dist in different clusters
     maximum_same_c = sys.float_info.min  # max dist in the same cluster
     self.sums = [0 for _ in range(n_clusters)]
     for i in range(n_points):
         self.centroid_dists[i] = utils.euclidian_dist(
             X[i], self.centroids[labels[i]])
         self.sums[labels[i]] += self.centroid_dists[i]
     for i in range(n_points - 1):
         for j in range(i + 1, n_points):
             dist = utils.euclidian_dist(X[i], X[j])
             self.dists[i][j] = dist
             self.dists[j][i] = dist
             # Only pairs inside one cluster contribute to the diameter
             # term; taking every pair would always yield the diameter of
             # the whole data set.
             if labels[i] == labels[j]:
                 self.dists_same_c.append([i, j])
                 maximum_same_c = max(dist, maximum_same_c)
     for i in range(n_clusters):
         for j in range(n_clusters):
             if i == j:
                 continue
             pair_size = self.cluster_sizes[i] + self.cluster_sizes[j]
             if pair_size == 0:
                 # Two empty clusters: no meaningful distance.
                 self.delta[i][j] = float('inf')
             else:
                 self.delta[i][j] = (
                     (self.sums[i] + self.sums[j]) / float(pair_size))
             minimum_dif_c = min(minimum_dif_c, self.delta[i][j])
     return -(minimum_dif_c / maximum_same_c)
Esempio n. 4
0
def sv(X, labels, n_clusters):
    """Return the negated SV validity index of a clustering.

    Numerator: for every cluster, the distance from its centroid to the
    nearest *other* centroid, summed over clusters.
    Denominator: for every cluster, ``10 / |C_k|`` times the sum of the
    ``ceil(0.1 * |C_k|)`` largest point-to-centroid distances.

    The original numerator only compared a centroid with higher-indexed
    centroids and skipped the last cluster entirely; every other centroid
    is now considered.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    numerator = 0.0
    for k in range(n_clusters):
        # default=0.0 keeps the single-cluster case at a zero numerator.
        numerator += min(
            (utils.euclidian_dist(centroids[k], centroids[l])
             for l in range(n_clusters) if l != k),
            default=0.0)

    denominator = 0.0
    for k in range(n_clusters):
        member_dists = [
            utils.euclidian_dist(X[i], centroids[k])
            for i, label in enumerate(labels) if label == k
        ]
        # Sum of the 0.1 * |C_k| largest distances to the centroid.
        n_top = int(math.ceil(0.1 * cluster_sizes[k]))
        top_sum = sum(heapq.nlargest(n_top, member_dists), 0.0)
        denominator += top_sum * 10.0 / cluster_sizes[k]
    return -numerator / denominator
Esempio n. 5
0
def cs_index(X, labels, n_clusters):
    """Return the CS validity index of a clustering.

    Numerator: for every point, the distance to the farthest point of its
    own cluster divided by that cluster's size, summed over all points.
    Denominator: for every cluster, the distance from its centroid to the
    nearest other centroid, summed over clusters.

    Fixes over the original: the pair scan used ``range(i, elements - 1)``
    and therefore never looked at the last point nor at points preceding
    ``i``; the centroid scan only looked at higher-indexed clusters, so the
    last cluster always contributed ``sys.float_info.max``.

    Raises AssertionError when the denominator is zero.
    """
    elements, _ = X.shape
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    # Smallest positive float as the floor, as in the original.
    max_dists = [sys.float_info.min] * elements
    for i in range(elements - 1):
        for j in range(i + 1, elements):
            if labels[i] != labels[j]:
                continue  # only pairs inside the same cluster count
            # The distance is symmetric: update both end points at once.
            dist = utils.euclidian_dist(X[i], X[j])
            max_dists[i] = max(max_dists[i], dist)
            max_dists[j] = max(max_dists[j], dist)

    numerator = 0.0
    for i in range(elements):
        numerator += max_dists[i] / cluster_sizes[labels[i]]

    denominator = 0.0
    for i in range(n_clusters):
        min_centroids_dist = sys.float_info.max
        for j in range(n_clusters):
            if j == i:
                continue
            min_centroids_dist = min(
                utils.euclidian_dist(centroids[i], centroids[j]),
                min_centroids_dist)
        denominator += min_centroids_dist

    assert denominator != 0.0
    return numerator / denominator
Esempio n. 6
0
def dunn53(X, labels, n_clusters):
    """Return the negated generalized Dunn index with delta5 / Delta3.

    Between-cluster term for clusters i and j:
    ``(S_i + S_j) / (|C_i| + |C_j|)`` where ``S_c`` is the sum of the
    distances of the members of ``c`` to its centroid.
    Within-cluster term: ``2 * S_c / |C_c|``.
    The result is ``-(min between-cluster term / max within-cluster term)``.

    The original implementation took the minimum over a matrix whose
    diagonal was always zero (so the result was always ``-0.0``), scanned
    only the first half of the points and added 2 instead of scaling by 2.
    Empty clusters are ignored.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)

    sizes = [0] * n_clusters
    sums = [0.0] * n_clusters
    for i, label in enumerate(labels):
        sizes[label] += 1
        sums[label] += utils.euclidian_dist(X[i], centroids[label])

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    for i in range(n_clusters - 1):
        for j in range(i + 1, n_clusters):
            pair_size = sizes[i] + sizes[j]
            if pair_size == 0:
                continue
            minimum_dif_c = min(minimum_dif_c,
                                (sums[i] + sums[j]) / float(pair_size))

    maximum_same_c = sys.float_info.min  # max spread inside one cluster
    for c in range(n_clusters):
        if sizes[c] == 0:
            continue
        maximum_same_c = max(maximum_same_c, 2.0 * sums[c] / sizes[c])
    return -minimum_dif_c / maximum_same_c
Esempio n. 7
0
    def find(self, X, labels, n_clusters):
        """Compute the index from scratch and cache the intermediate state.

        Cached on ``self``: data diameter, centroids, cluster sizes, each
        point's distance to its own centroid (``centroid_dists``), the
        per-cluster sums of those distances (``sums``) and the matrix of
        centroid-to-centroid distances (``centers``, diagonal left at
        ``sys.float_info.max``).

        Returns ``-(min centroid distance between two non-empty clusters /
        max over clusters of 2 * sums[c] / size[c])``.
        """
        self.diameter = utils.find_diameter(X)
        self.centroids = cluster_centroid.cluster_centroid(
            X, labels, n_clusters)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)

        self.sums = [0 for _ in range(n_clusters)]
        self.centroid_dists = [0 for _ in range(len(labels))]
        for i in range(len(labels)):
            self.centroid_dists[i] = utils.euclidian_dist(
                X[i], self.centroids[labels[i]])
            self.sums[labels[i]] += self.centroid_dists[i]

        self.centers = np.array(
            [[sys.float_info.max] * n_clusters for _ in range(n_clusters)])
        for i in range(n_clusters - 1):
            for j in range(i + 1, n_clusters):
                dist = utils.euclidian_dist(
                    self.centroids[i], self.centroids[j])
                self.centers[i][j] = dist
                self.centers[j][i] = dist

        # The minimum only depends on which clusters actually own points,
        # so scan label pairs instead of all point pairs.
        minimum_dif_c = sys.float_info.max  # min dist in different clusters
        present = list(set(labels))
        for a in present:
            for b in present:
                if a != b:
                    minimum_dif_c = min(self.centers[a][b], minimum_dif_c)

        # Work on a copy so that self.sums keeps the raw sums.
        denominator = list(self.sums)
        for i in range(n_clusters):
            # Empty clusters keep their (zero) sum instead of dividing by 0.
            if self.cluster_sizes[i] != 0:
                denominator[i] *= (2 / self.cluster_sizes[i])

        return -(minimum_dif_c / max(denominator))
Esempio n. 8
0
def dunn43(X, labels, n_clusters):
    """Return the negated generalized Dunn index with delta4 / Delta3.

    Between-cluster term: distance between the centroids of two non-empty
    clusters. Within-cluster term: ``2 * S_c / |C_c|`` where ``S_c`` is the
    sum of the distances of the members of ``c`` to its centroid.
    The result is ``-(min between-cluster term / max within-cluster term)``.

    The original accumulated the within-cluster sum once per same-cluster
    pair and then added 2 instead of multiplying by 2, which produced
    ``S_c + 2`` rather than twice the mean distance.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)

    sizes = [0] * n_clusters
    sums = [0.0] * n_clusters
    for i, label in enumerate(labels):
        sizes[label] += 1
        sums[label] += utils.euclidian_dist(X[i], centroids[label])

    # Only clusters that own at least one point take part in either term.
    present = [c for c in range(n_clusters) if sizes[c] > 0]

    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    for pos, a in enumerate(present):
        for b in present[pos + 1:]:
            minimum_dif_c = min(
                utils.euclidian_dist(centroids[a], centroids[b]),
                minimum_dif_c)

    maximum_same_c = sys.float_info.min  # max spread inside one cluster
    for c in present:
        maximum_same_c = max(2.0 * sums[c] / sizes[c], maximum_same_c)
    return -minimum_dif_c / maximum_same_c
Esempio n. 9
0
 def update(self, X, n_clusters, labels, k, l, id):
     """Incrementally re-evaluate the index after a point changed cluster.

     The centroids of clusters ``k`` and ``l`` are recomputed, so every
     cached centroid distance that involves one of them is refreshed. The
     original only refreshed pairs whose other index was *greater* than
     ``k`` / ``l``, leaving the remaining pairs stale.

     Per-point ``dist_ps`` values are only recomputed for members of a
     cluster whose centroid moved by more than ``diameter / (10 * len(X))``.

     Returns ``-(max centroid distance / (sum(dist_ps) * n_clusters))``.
     """
     point = X[id]
     prev_centroids = np.copy(self.centroids)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(
         labels, n_clusters)
     self.centroids = cluster_centroid.update_centroids(
         np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

     # NOTE(review): assumes dist_centroids stores each pair once at
     # [smaller index][larger index], which is where the original wrote.
     for moved in (k, l):
         for i in range(n_clusters):
             if i == moved:
                 continue
             low, high = (moved, i) if moved < i else (i, moved)
             self.dist_centroids[low][high] = utils.euclidian_dist(
                 self.centroids[i], self.centroids[moved])
     numerator = np.amax(self.dist_centroids)

     # 10 ** (-log10(n) - 1) == 1 / (10 * n)
     delta = 10**(-math.log(len(X), 10) - 1)
     threshold = delta * self.diameter
     moved_k = utils.euclidian_dist(
         prev_centroids[k], self.centroids[k]) > threshold
     moved_l = utils.euclidian_dist(
         prev_centroids[l], self.centroids[l]) > threshold
     for i in range(len(labels)):
         if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
             self.dist_ps[i] = utils.d_ps(X, labels, X[i], labels[i],
                                          self.centroids)
     denominator = sum(self.dist_ps)
     return -(numerator / (denominator * n_clusters))
Esempio n. 10
0
def c_ind(X, labels, n_clusters):
    """Return the C-index of a clustering.

    ``s_c`` is the sum of the distances between all pairs of points that
    share a cluster, ``n_w`` the number of such pairs, and ``s_min`` /
    ``s_max`` the sums of the ``n_w`` smallest / largest pairwise distances
    in the whole data set. The result is
    ``(s_c - s_min) / (s_max - s_min)``.

    The original summed distances over a rows x half-rows rectangle without
    looking at the labels, so ``s_c`` was not a within-cluster sum.
    """
    rows, _ = X.shape

    s_c = 0.0
    distances = []
    # One pass over all unordered pairs feeds both the within-cluster sum
    # and the list of all pairwise distances.
    for i in range(rows - 1):
        for j in range(i + 1, rows):
            dist = utils.euclidian_dist(X[i], X[j])
            distances.append(dist)
            if labels[i] == labels[j]:
                s_c += dist

    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
    n_w = int(sum(cluster_sizes[k] * (cluster_sizes[k] - 1) / 2
                  for k in range(n_clusters)))

    s_min_c = sum(heapq.nsmallest(n_w, distances))
    s_max_c = sum(heapq.nlargest(n_w, distances))
    return (s_c - s_min_c) / (s_max_c - s_min_c)
Esempio n. 11
0
 def update(self, X, n_clusters, labels, k, l, id):
     """Incrementally re-evaluate the SV-style index after a point moved.

     The centroids of clusters ``k`` and ``l`` are recomputed, so the
     cached centroid distances of *every* pair involving one of them are
     refreshed (the original skipped pairs whose other index was smaller
     than ``k`` / ``l``). Diagonal entries are left untouched.

     Point-to-centroid distances are only recomputed for members of a
     cluster whose centroid moved by more than ``diameter / (10 * len(X))``.

     Returns ``-(numerator / denominator)`` where the numerator sums each
     cluster's smallest cached centroid distance and the denominator sums,
     per cluster, ``10 / size`` times the ``ceil(0.1 * size)`` largest
     cached point-to-centroid distances.
     """
     point = X[id]
     prev_centroids = np.copy(self.centroids)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(
         labels, n_clusters)
     self.centroids = cluster_centroid.update_centroids(
         np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

     for moved in (k, l):
         for i in range(n_clusters):
             if i == moved:
                 continue
             dist = utils.euclidian_dist(
                 self.centroids[i], self.centroids[moved])
             self.centroid_dists[moved][i] = dist
             self.centroid_dists[i][moved] = dist

     numerator = 0.0
     for i in range(n_clusters):
         numerator += np.amin(self.centroid_dists[i])

     # The moved point no longer contributes to cluster k.
     self.dists[k][id] = 0.
     # 10 ** (-log10(n) - 1) == 1 / (10 * n)
     delta = 10**(-math.log(len(X), 10) - 1)
     threshold = delta * self.diameter
     moved_k = utils.euclidian_dist(
         prev_centroids[k], self.centroids[k]) > threshold
     moved_l = utils.euclidian_dist(
         prev_centroids[l], self.centroids[l]) > threshold
     for i in range(len(labels)):
         if (labels[i] == k and moved_k) or (labels[i] == l and moved_l):
             self.dists[labels[i]][i] = utils.euclidian_dist(
                 X[i], self.centroids[labels[i]])

     denominator = 0.0
     for c in range(n_clusters):
         # Sum of the 0.1 * |C_c| largest distances to the centroid.
         n_top = int(math.ceil(0.1 * self.cluster_sizes[c]))
         top_sum = sum(heapq.nlargest(n_top, self.dists[c]), 0.0)
         denominator += top_sum * 10.0 / self.cluster_sizes[c]
     return -(numerator / denominator)
Esempio n. 12
0
    def find(self, X, labels, n_clusters):
        """Compute the C-index style score from scratch and cache its parts.

        Cached on ``self``: centroids, data diameter, cluster sizes, the
        within-cluster distance sum ``s_c``, the number of within-cluster
        pairs ``n_w``, every pairwise distance (``distances``) and the
        ``n_w`` smallest / largest of them (``s_min`` / ``s_max``).

        Returns ``(s_c - sum(s_min)) / (sum(s_max) - sum(s_min))``.
        """
        self.centroids = cluster_centroid.cluster_centroid(
            X, labels, n_clusters)
        self.diameter = utils.find_diameter(X)

        n_rows, _ = X.shape
        # Sum of distances over unordered pairs that share a label.
        self.s_c = sum(
            utils.euclidian_dist(X[a], X[b])
            for a in range(n_rows - 1)
            for b in range(a + 1, n_rows)
            if labels[a] == labels[b])

        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        # Number of unordered pairs inside each cluster, summed.
        self.n_w = sum(
            self.cluster_sizes[c] * (self.cluster_sizes[c] - 1) / 2
            for c in range(n_clusters))

        n_labels = len(labels)
        self.distances = [
            utils.euclidian_dist(X[a], X[b])
            for a in range(n_labels - 1)
            for b in range(a + 1, n_labels)
        ]

        n_pairs = int(self.n_w)
        self.s_min = heapq.nsmallest(n_pairs, self.distances)
        self.s_max = heapq.nlargest(n_pairs, self.distances)

        smallest_total = sum(self.s_min)
        largest_total = sum(self.s_max)
        return (self.s_c - smallest_total) / (largest_total - smallest_total)
Esempio n. 13
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Incrementally re-evaluate the score after a point changed cluster.

        Per-cluster sigmas and the density term are only recomputed when
        the centroid of ``k`` or ``l`` moved by more than
        ``diameter / (10 * len(X))``; otherwise the cached values are
        reused.

        The original divided ``self.dens`` by ``n_clusters * (n_clusters -
        1)`` on every call, including calls that reused the cached value,
        so the cached density shrank with each such update. The
        normalisation is now applied only to a freshly computed sum.

        Returns ``sum(sigmas) / (n_clusters * normed_sigma_x) + dens``.
        """
        point = X[id]
        prev_centroids = np.copy(self.centroids)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes), point, k, l)

        # 10 ** (-log10(n) - 1) == 1 / (10 * n)
        delta = 10**(-math.log(len(X), 10) - 1)
        threshold = delta * self.diameter
        moved_k = utils.euclidian_dist(
            prev_centroids[k], self.centroids[k]) > threshold
        moved_l = utils.euclidian_dist(
            prev_centroids[l], self.centroids[l]) > threshold

        if moved_k:
            self.sigmas[k] = self.normed_cluster_sigma(X, labels, k)
        if moved_l:
            self.sigmas[l] = self.normed_cluster_sigma(X, labels, l)
        term1 = sum(self.sigmas) / (n_clusters * self.normed_sigma_x)
        stdev_val = self.stdev(n_clusters)

        if moved_k or moved_l:
            dens = 0.0
            # Distinct loop names: the original re-bound k and l here.
            for a in range(0, n_clusters):
                for b in range(0, n_clusters):
                    dens += self.den2(X, labels, self.centroids, a, b, stdev_val) /\
                            max(self.den1(X, labels, self.centroids, a, stdev_val),
                                self.den1(X, labels, self.centroids, b, stdev_val))
            self.dens = dens / (n_clusters * (n_clusters - 1))

        return (term1 + self.dens)
Esempio n. 14
0
    def find(self, X, labels, n_clusters):
        """Compute the index from scratch and cache the intermediate state.

        Cached on ``self``: data diameter, centroids, cluster sizes, the
        full pairwise distance matrix (``dists``), the list of same-cluster
        index pairs (``dist_same_c``) and the centroid-to-centroid distance
        matrix (``centers``, diagonal left at ``sys.float_info.max``).

        Returns ``-(min centroid distance over differently labelled point
        pairs / max distance over same-labelled point pairs)``.
        """
        self.diameter = utils.find_diameter(X)
        self.centroids = cluster_centroid.cluster_centroid(
            X, labels, n_clusters)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)
        self.dist_same_c = []
        n_points, _ = X.shape
        self.dists = [[0.] * n_points for _ in range(n_points)]

        self.centers = np.array(
            [[sys.float_info.max] * n_clusters for _ in range(n_clusters)])
        for a in range(n_clusters):
            for b in range(n_clusters):
                if a == b:
                    continue
                self.centers[a][b] = utils.euclidian_dist(
                    self.centroids[a], self.centroids[b])

        nearest_between = sys.float_info.max  # min dist, different clusters
        widest_within = sys.float_info.min  # max dist, same cluster
        for i in range(n_points - 1):
            for j in range(i + 1, n_points):
                gap = utils.euclidian_dist(X[i], X[j])
                self.dists[i][j] = gap
                self.dists[j][i] = gap
                if labels[i] == labels[j]:
                    self.dist_same_c.append([i, j])
                    widest_within = max(gap, widest_within)
                else:
                    nearest_between = min(
                        self.centers[labels[i]][labels[j]], nearest_between)
        return -(nearest_between / widest_within)
Esempio n. 15
0
 def find(self, X, labels, n_clusters):
     """Compute the index from scratch and cache the intermediate state.

     Cached on ``self``: centroids, the full pairwise distance matrix
     (``dists``), cluster sizes (``point_in_c``), each point's distance to
     its own centroid (``centroid_dists``), the per-cluster sums of those
     distances (``sums``) and the between-cluster matrix ``delta``.

     ``self.sums`` is left holding the raw sums. The original rescaled it
     in place to ``2 * sum / size``, while the incremental ``update`` reads
     ``self.sums`` as raw sums and applies the same scaling again.

     Returns ``-(min non-zero delta / max over clusters of
     2 * sums[c] / point_in_c[c])``.
     """
     self.centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
     self.dists = [[0. for _ in range(len(labels))] for _ in range(len(labels))]
     self.sums = [0 for _ in range(n_clusters)]
     rows, colums = X.shape
     self.point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)
     self.delta_l = [[0.0] * n_clusters] * n_clusters
     # np.array copies the nested list, so the aliased rows of delta_l do
     # not alias inside self.delta.
     self.delta = np.array(self.delta_l)
     self.centroid_dists = [0 for _ in range(len(labels))]
     minimum_dif_c = sys.float_info.max
     for i in range(len(labels)):
         self.centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])
         self.sums[labels[i]] += self.centroid_dists[i]
     for i in range(rows - 1):
         for j in range(i + 1, rows):
             self.dists[i][j] = utils.euclidian_dist(X[i], X[j])
             self.dists[j][i] = self.dists[i][j]
             if labels[i] != labels[j]:
                 # NOTE(review): only the [label of lower index][label of
                 # higher index] cell is incremented; update() follows the
                 # same convention, so it is kept as is.
                 self.delta[labels[i]][labels[j]] += self.dists[i][j]
     # Scale a copy so the cached raw sums stay valid for update().
     denominator = list(self.sums)
     for i in range(n_clusters):
         for j in range(n_clusters):
             self.delta[i][j] /= float(self.point_in_c[i] * self.point_in_c[j])
             if self.delta[i][j] != 0:
                 minimum_dif_c = min(minimum_dif_c, self.delta[i][j])
         denominator[i] *= (2 / self.point_in_c[i])
     return -(minimum_dif_c / max(denominator))
Esempio n. 16
0
    def find(self, X, labels, n_clusters):
        """Compute the SV-style index from scratch and cache its parts.

        Cached on ``self``: data diameter, centroids, cluster sizes, the
        symmetric centroid distance matrix (``centroid_dists``, diagonal
        left at ``sys.float_info.max``) and, per cluster, each member's
        distance to the centroid (``dists``, zero for non-members).

        Returns ``-(numerator / denominator)``: the numerator sums every
        cluster's distance to its nearest other centroid, the denominator
        sums ``10 / size`` times the ``ceil(0.1 * size)`` largest member
        distances of every cluster.
        """
        self.diameter = utils.find_diameter(X)
        self.centroids = cluster_centroid.cluster_centroid(
            X, labels, n_clusters)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)

        self.centroid_dists = [[sys.float_info.max] * n_clusters
                               for _ in range(n_clusters)]
        self.dists = [[0] * len(labels) for _ in range(n_clusters)]

        for first in range(n_clusters - 1):
            for second in range(first + 1, n_clusters):
                gap = utils.euclidian_dist(
                    self.centroids[first], self.centroids[second])
                self.centroid_dists[first][second] = gap
                self.centroid_dists[second][first] = gap

        numerator = 0.0
        for row in self.centroid_dists:
            numerator += np.amin(row)

        for cluster in range(n_clusters):
            for idx, label in enumerate(labels):
                if label == cluster:
                    self.dists[cluster][idx] = utils.euclidian_dist(
                        X[idx], self.centroids[cluster])

        denominator = 0.0
        for cluster in range(n_clusters):
            # Sum of the 0.1 * |C_k| largest distances to the centroid.
            n_top = int(math.ceil(0.1 * self.cluster_sizes[cluster]))
            top_sum = sum(heapq.nlargest(n_top, self.dists[cluster]), 0.0)
            denominator += top_sum * 10.0 / self.cluster_sizes[cluster]
        return -(numerator / denominator)
Esempio n. 17
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after the point at index ``id`` changed cluster.

        Cluster sizes and centroids are refreshed, the between-cluster
        matrix ``self.delta`` is rebuilt from per-cluster sums of
        point-to-centroid distances, and the result is
        ``-(min delta / max over clusters of 2 * sum / size)``.

        ``k`` and ``l`` are the two clusters whose sums are rebuilt --
        presumably the point's old and new cluster; confirm against caller.
        """
        point = X[id]
        prev_cluster_sizes = list(self.cluster_sizes)
        prev_centroids = np.copy(self.centroids)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(np.copy(labels), n_clusters)
        # NOTE(review): unlike sibling update methods, centroids and sizes
        # are passed without np.copy here -- confirm update_centroids does
        # not rely on receiving copies.
        self.centroids = cluster_centroid.update_centroids(self.centroids, self.cluster_sizes, point, k, l)
        minimum_dif_c = sys.float_info.max  # min dist in different clusters

        #update numerator

        # Point-to-centroid distances are only recomputed for members of a
        # cluster whose centroid moved by more than dell * diameter, where
        # dell == 10 ** (-log10(n) - 1) == 1 / (10 * n).
        new_centroid_dists = list(self.centroid_dists)
        dell = 10**(-math.log(len(X), 10) - 1)
        for i in range(len(labels)):
            if (labels[i] == k and utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > dell * self.diameter
               or labels[i] == l and utils.euclidian_dist(prev_centroids[l], self.centroids[l]) > dell * self.diameter):
                new_centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])


        # NOTE(review): every off-diagonal entry rescaled here is
        # overwritten by the rebuild loop further down, so this loop only
        # has a lasting effect on the diagonal.
        for i in range(n_clusters):
            for j in range(n_clusters):
                self.delta[i][j] *= (prev_cluster_sizes[i] + prev_cluster_sizes[j])

        # Untouched clusters reuse their cached sum; k and l are re-summed
        # from the (possibly refreshed) per-point distances.
        new_sums = [0 for _ in range(n_clusters)]
        for i in range(n_clusters):
            if i != k and i != l:
                new_sums[i] = self.sums[i]
        for i in range(len(labels)):
            if labels[i] == k or labels[i] == l:
                new_sums[labels[i]] += new_centroid_dists[i]

        for i in range(n_clusters):
            for j in range(n_clusters):
                if i != j:
                    if self.cluster_sizes[i] + self.cluster_sizes[j] == 0:
                        # two empty clusters: avoid dividing by zero
                        self.delta[i][j] = float('inf')
                    else:
                        self.delta[i][j] = (new_sums[i] + new_sums[j]) / float(self.cluster_sizes[i] + self.cluster_sizes[j])
                    minimum_dif_c = min(minimum_dif_c, self.delta[i][j])


        #update denominator
        # NOTE(review): new_sums and new_centroid_dists are not written
        # back to self.sums / self.centroid_dists in this method.
        denominator = list(new_sums)
        #print(denominator)
        for i in range(n_clusters):
            if self.cluster_sizes[i] == 0:
                denominator[i] = float('inf')
            else:
                denominator[i] *= (2 / self.cluster_sizes[i])

        return -(minimum_dif_c / max(denominator))
Esempio n. 18
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after the point at index ``id`` changed cluster.

        The between-cluster matrix ``self.delta`` is converted back to raw
        sums, the moved point's cached pairwise distances are removed from
        the entries of cluster ``k`` and added to those of cluster ``l``,
        and the matrix is normalised again. The result is
        ``-(min non-zero delta / max over clusters of 2 * sum / size)``.

        ``k`` / ``l`` are presumably the point's old / new cluster --
        confirm against caller.
        """
        self.diameter = utils.find_diameter(X)
        prev_point_in_c = list(self.point_in_c)
        prev_centroids = np.copy(self.centroids)
        self.point_in_c = cluster_centroid.count_cluster_sizes(labels, n_clusters)
        self.centroids = cluster_centroid.update_centroids(np.copy(self.centroids), np.copy(self.point_in_c), X[id], k, l)
        minimum_dif_c = sys.float_info.max  # min dist in different clusters

        #update numerator

        # Undo the previous normalisation (division by the product of the
        # two cluster sizes) so the entries hold raw distance sums again.
        for i in range(n_clusters):
            for j in range(n_clusters):
                self.delta[i][j] *= (prev_point_in_c[i] * prev_point_in_c[j])

        # The cell that is adjusted depends on whether the moved point's
        # index is lower or higher than the other point's index.
        for i in range(len(labels)):
            if labels[i] != k and id < i:
                self.delta[k][labels[i]] -= self.dists[id][i]
            if labels[i] != k and id > i:
                self.delta[labels[i]][k] -= self.dists[i][id]
            if labels[i] != l and id < i:
                self.delta[l][labels[i]] += self.dists[id][i]
            if labels[i] != l and id > i:
                self.delta[labels[i]][l] += self.dists[i][id]

        # NOTE(review): the rescale above touches every cell, but only the
        # upper triangle (i < j) is normalised again here.
        for i in range(n_clusters - 1):
            for j in range(i + 1, n_clusters):
                self.delta[i][j] /= float(self.point_in_c[i] * self.point_in_c[j])
                if self.delta[i][j] != 0:
                    minimum_dif_c = min(minimum_dif_c, self.delta[i][j])
        # update denominator
        # Point-to-centroid distances are only recomputed for members of a
        # cluster whose centroid moved by more than dell * diameter, where
        # dell == 10 ** (-log10(n) - 1) == 1 / (10 * n).
        new_centroid_dists = list(self.centroid_dists)
        dell = 10 ** (-math.log(len(X), 10) - 1)
        for i in range(len(labels)):
            if (labels[i] == k and utils.euclidian_dist(prev_centroids[k], self.centroids[k]) > dell * self.diameter
                or labels[i] == l and utils.euclidian_dist(prev_centroids[l],
                                                           self.centroids[l]) > dell * self.diameter):
                new_centroid_dists[i] = utils.euclidian_dist(X[i], self.centroids[labels[i]])
        # Untouched clusters reuse self.sums as a raw sum; k and l are
        # re-summed from the per-point distances.
        new_sums = [0 for _ in range(n_clusters)]
        for i in range(n_clusters):
            if i != k and i != l:
                new_sums[i] = self.sums[i]
        for i in range(len(labels)):
            if labels[i] == k or labels[i] == l:
                new_sums[labels[i]] += new_centroid_dists[i]
        # NOTE(review): new_sums and new_centroid_dists are not written
        # back to self.sums / self.centroid_dists in this method.
        denominator = list(new_sums)
        for i in range(n_clusters):
            # empty clusters keep their unscaled sum
            if self.point_in_c[i] != 0:
                denominator[i] *= (2 / self.point_in_c[i])
        return -(minimum_dif_c / max(denominator))
Esempio n. 19
0
def bcd_score(X, labels, n_clusters, centroids, cluster_sizes):
    """Return the size-weighted mean centroid distance to the data mean.

    Each centroid's distance to the column-wise mean of ``X`` is weighted
    by its cluster size; the total is divided by the number of labels and
    by the number of clusters.
    """
    grand_mean = np.mean(X, axis=0)
    weighted_total = sum(
        (cluster_sizes[c] * utils.euclidian_dist(centroids[c], grand_mean)
         for c in range(n_clusters)),
        0.0)
    return weighted_total / len(labels) / n_clusters
Esempio n. 20
0
def os(X, labels, n_clusters):
    """Return the negated ratio of total ``ov`` score to cluster spread.

    Numerator: ``ov(X, labels, x, k)`` summed over every point ``x`` of
    every cluster ``k``. Denominator: for every cluster, ``10 / size``
    times the sum of the ``ceil(0.1 * size)`` largest member-to-centroid
    distances.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)

    overlap_total = 0.0
    for cluster in range(n_clusters):
        for idx, label in enumerate(labels):
            if label == cluster:
                overlap_total += ov(X, labels, X[idx], cluster)

    spread_total = 0.0
    for cluster in range(n_clusters):
        member_dists = [
            utils.euclidian_dist(X[idx], centroids[cluster])
            for idx, label in enumerate(labels) if label == cluster
        ]
        # Sum of the 0.1 * |C_k| largest distances to the centroid.
        n_top = int(math.ceil(0.1 * cluster_sizes[cluster]))
        top_sum = sum(heapq.nlargest(n_top, member_dists), 0.0)
        spread_total += top_sum * 10.0 / cluster_sizes[cluster]

    return -overlap_total / spread_total
Esempio n. 21
0
 def find(self, X, labels, n_clusters):
     """Compute the index from scratch and cache the intermediate state.

     Cached on ``self``: centroids, cluster sizes, data diameter, the full
     pairwise distance matrix (``dists``), the list of same-cluster index
     pairs (``dist_same_c``) and the between-cluster matrix ``delta``.

     Returns ``-(min non-zero upper-triangle delta / max same-cluster
     distance)``.
     """
     self.centroids = cluster_centroid.cluster_centroid(
         X, labels, n_clusters)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(
         labels, n_clusters)
     self.diameter = utils.find_diameter(X)
     n_labels = len(labels)
     self.dists = [[0.] * n_labels for _ in range(n_labels)]
     self.dist_same_c = []
     n_points, _ = X.shape
     self.delta = np.array(
         [[0.0] * n_clusters for _ in range(n_clusters)])

     widest_within = sys.float_info.min  # max dist in the same cluster
     for i in range(n_points - 1):
         for j in range(i + 1, n_points):
             gap = utils.euclidian_dist(X[i], X[j])
             self.dists[i][j] = gap
             self.dists[j][i] = gap
             if labels[i] == labels[j]:
                 self.dist_same_c.append([i, j])
                 widest_within = max(gap, widest_within)
             else:
                 # Accumulated at [label of i][label of j] with i < j.
                 self.delta[labels[i]][labels[j]] += gap

     nearest_between = sys.float_info.max  # min dist, different clusters
     for a in range(n_clusters - 1):
         for b in range(a + 1, n_clusters):
             self.delta[a][b] /= float(self.cluster_sizes[a] *
                                       self.cluster_sizes[b])
             if self.delta[a][b] != 0:
                 nearest_between = min(nearest_between, self.delta[a][b])
     return -nearest_between / widest_within
 def find(self, X, labels, n_clusters):
     """Compute the index from scratch and cache the intermediate state.

     Cached on ``self``: data diameter, per-cluster scatter values from
     ``self.s`` (``s_clusters``), centroids, cluster sizes
     (``points_in_clusters``) and the matrix ``sums`` holding
     ``(s_i + s_j) / centroid distance`` for every ordered pair of
     distinct clusters (left at 0 when the centroids coincide).

     Returns the mean over clusters of the largest entry in their row.
     """
     self.diameter = utils.find_diameter(X)
     self.s_clusters = [0.] * n_clusters
     self.centroids = cluster_centroid.cluster_centroid(
         X, labels, n_clusters)
     self.points_in_clusters = cluster_centroid.count_cluster_sizes(
         labels, n_clusters)
     for cluster in range(n_clusters):
         self.s_clusters[cluster] = self.s(
             X, cluster, self.points_in_clusters, labels, self.centroids)

     self.sums = [[0] * n_clusters for _ in range(n_clusters)]
     total = 0
     for first in range(n_clusters):
         for second in range(n_clusters):
             if first == second:
                 continue
             gap = utils.euclidian_dist(self.centroids[first],
                                        self.centroids[second])
             if gap == 0:
                 # Coinciding centroids: leave the ratio at zero.
                 continue
             self.sums[first][second] = (
                 self.s_clusters[first] + self.s_clusters[second]) / gap
         total += np.amax(self.sums[first])
     return total / float(n_clusters)
Esempio n. 23
0
 def s(self, X, cluster_k_index, cluster_sizes, labels, centroids):
     """Return the mean distance of a cluster's members to its centroid.

     Uses the ``cluster_sizes`` and ``centroids`` arguments. The original
     ignored both and read ``self.cluster_sizes`` / ``self.centroids``,
     although the calling ``find`` stores the sizes under a different
     attribute name.

     Returns ``float('inf')`` for an empty cluster.
     """
     size = cluster_sizes[cluster_k_index]
     if size == 0:
         return float('inf')
     centroid = centroids[cluster_k_index]
     total = 0
     for i, label in enumerate(labels):
         if label == cluster_k_index:
             total += utils.euclidian_dist(X[i], centroid)
     return total / size
Esempio n. 24
0
    def update(self, X, n_clusters, labels, k, l, id):
        """Re-evaluate the index after the point at index ``id`` changed cluster.

        Cluster sizes and centroids are refreshed, the per-cluster sums of
        point-to-centroid distances are rebuilt for ``k`` and ``l``, and
        the rows/columns of ``self.centers`` belonging to ``k`` and ``l``
        are recomputed.

        Returns ``-(min entry of centers / max over clusters of
        2 * sum / size)``; empty clusters keep their unscaled sum.
        """
        moved_point = X[id]
        old_centroids = np.copy(self.centroids)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            np.copy(labels), n_clusters)
        self.centroids = cluster_centroid.update_centroids(
            np.copy(self.centroids), np.copy(self.cluster_sizes),
            moved_point, k, l)

        # --- denominator -------------------------------------------------
        # Per-point distances are only refreshed for members of a cluster
        # whose centroid moved by more than dell * diameter, where
        # dell == 10 ** (-log10(n) - 1) == 1 / (10 * n).
        dell = 10**(-math.log(len(X), 10) - 1)
        k_shifted = utils.euclidian_dist(
            old_centroids[k], self.centroids[k]) > dell * self.diameter
        l_shifted = utils.euclidian_dist(
            old_centroids[l], self.centroids[l]) > dell * self.diameter

        fresh_dists = list(self.centroid_dists)
        for idx, label in enumerate(labels):
            if (label == k and k_shifted) or (label == l and l_shifted):
                fresh_dists[idx] = utils.euclidian_dist(
                    X[idx], self.centroids[label])

        # Untouched clusters reuse the cached sum; k and l start from zero.
        fresh_sums = [
            self.sums[c] if c != k and c != l else 0
            for c in range(n_clusters)
        ]
        for idx, label in enumerate(labels):
            if label == k or label == l:
                fresh_sums[label] += fresh_dists[idx]

        denominator = [
            fresh_sums[c] * (2 / self.cluster_sizes[c])
            if self.cluster_sizes[c] != 0 else fresh_sums[c]
            for c in range(n_clusters)
        ]

        # --- numerator ---------------------------------------------------
        for c in range(n_clusters):
            if c != k:
                gap = utils.euclidian_dist(
                    self.centroids[c], self.centroids[k])
                self.centers[c][k] = gap
                self.centers[k][c] = gap
            if c != l:
                gap = utils.euclidian_dist(
                    self.centroids[c], self.centroids[l])
                self.centers[c][l] = gap
                self.centers[l][c] = gap

        nearest_between = np.amin(self.centers)
        return -(nearest_between / max(denominator))
Esempio n. 25
0
def a(X, labels, x_i, cluster_k_index):
    """Return the mean distance from ``x_i`` to one cluster's points.

    Args:
        X: indexable collection of points; ``X[j]`` carries ``labels[j]``.
        labels: cluster label of every point.
        x_i: the reference point.
        cluster_k_index: label of the cluster to average over.

    Returns:
        The average of ``utils.euclidian_dist(x_i, X[j])`` over every ``j``
        with ``labels[j] == cluster_k_index``.

    Raises:
        ZeroDivisionError: if no label equals ``cluster_k_index``.
    """
    distances = [
        utils.euclidian_dist(x_i, X[idx])
        for idx in range(len(labels))
        if labels[idx] == cluster_k_index
    ]
    # Start from 0.0 so an empty selection still divides a float by zero.
    return sum(distances, 0.0) / len(distances)
Esempio n. 26
0
def dl(X, labels, distance, n_clusters):
    """Count pairs of points from different clusters closer than ``distance``.

    Args:
        X: indexable collection of points; ``X[i]`` carries ``labels[i]``.
        labels: cluster label of every point.
        distance: threshold; a pair is counted when its
            ``utils.euclidian_dist`` is strictly below this value.
        n_clusters: unused; kept so existing callers keep working.

    Returns:
        The number of unordered pairs ``(k, l)`` with ``k < l``,
        ``labels[k] != labels[l]`` and distance below ``distance``.
    """
    n_points = len(labels)
    result = 0
    # k and l index points, not clusters. Bounding the loops by n_clusters
    # (as before) compared only the first n_clusters points with each other.
    for k in range(n_points - 1):
        for l in range(k + 1, n_points):
            if labels[k] == labels[l]:
                continue
            # x_k and x_l are in different clusters.
            if utils.euclidian_dist(X[k], X[l]) < distance:
                result += 1
    return result
Esempio n. 27
0
    def find(self, X, labels, n_clusters):
        """Compute the index from scratch and cache the intermediate state.

        Stored on ``self``:
            diameter: ``utils.find_diameter(X)``.
            centroids / cluster_sizes: results of the ``cluster_centroid``
                helpers for the given labelling.
            numerators: per cluster, the sum of distances from its points
                to the cluster centroid.
            inner_max_dists: point-by-point matrix holding the distance of
                every pair of points with different labels (0 elsewhere).
            outer_min_dists: per cluster, for every point outside it, the
                largest distance to a member of the cluster
                (``sys.float_info.max`` where nothing was recorded).
            accumulator: per cluster, its numerator divided by the
                smallest entry of its ``outer_min_dists`` row.

        Args:
            X: indexable collection of points.
            labels: cluster label of every point.
            n_clusters: number of clusters.

        Returns:
            ``sum(self.accumulator) / len(labels)``.
        """
        n_points = len(labels)
        point_ids = range(n_points)
        cluster_ids = range(n_clusters)

        self.diameter = utils.find_diameter(X)
        self.centroids = cluster_centroid.cluster_centroid(
            X, labels, n_clusters)
        self.cluster_sizes = cluster_centroid.count_cluster_sizes(
            labels, n_clusters)

        # Within-cluster scatter: distance of each point to its own centroid.
        self.numerators = [0.0] * n_clusters
        for idx in point_ids:
            own = labels[idx]
            self.numerators[own] += utils.euclidian_dist(
                X[idx], self.centroids[own])

        self.inner_max_dists = [[0] * n_points for _ in point_ids]
        self.outer_min_dists = [[sys.float_info.max] * n_points
                                for _ in cluster_ids]
        self.accumulator = [0] * n_clusters

        # Indices of the points belonging to each cluster.
        members_of = [[j for j in point_ids if labels[j] == c]
                      for c in cluster_ids]

        # Fill the symmetric matrix of cross-cluster point distances.
        for cluster in cluster_ids:
            for outsider in point_ids:
                if labels[outsider] == cluster:
                    continue
                for member in members_of[cluster]:
                    dist = utils.euclidian_dist(X[outsider], X[member])
                    self.inner_max_dists[outsider][member] = dist
                    self.inner_max_dists[member][outsider] = dist

        for cluster in cluster_ids:
            row = self.outer_min_dists[cluster]
            for outsider in point_ids:
                if labels[outsider] == cluster:
                    continue
                dists = self.inner_max_dists[outsider]
                # Seeded with 0 so an empty cluster yields 0 (left untouched).
                farthest = max([0] + [dists[m] for m in members_of[cluster]])
                if farthest != 0:
                    row[outsider] = farthest
            self.accumulator[cluster] = (
                self.numerators[cluster] / np.amin(row))

        return sum(self.accumulator) / n_points
Esempio n. 28
0
def dunn41(X, labels, n_clusters):
    """Return the negated ratio of centroid separation to cluster diameter.

    The separation is the smallest centroid-to-centroid distance between
    the clusters of any two differently labelled points; the diameter is
    the largest distance between two points sharing a label.

    Args:
        X: array of points with a ``shape`` attribute; ``X[i]`` carries
            ``labels[i]``.
        labels: cluster label of every point, usable as an index into the
            centroid-distance table.
        n_clusters: number of clusters.

    Returns:
        ``-separation / diameter``. The diameter starts at
        ``sys.float_info.min`` (the smallest positive normalized float), so
        it is never zero; the separation stays at ``sys.float_info.max``
        when all points share one label.
    """
    centroids = cluster_centroid.cluster_centroid(X, labels, n_clusters)
    rows = X.shape[0]
    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    maximum_same_c = sys.float_info.min  # max dist in the same cluster

    # Table of pairwise centroid distances.
    centers = np.zeros((n_clusters, n_clusters))
    for i in range(n_clusters):
        for j in range(n_clusters):
            centers[i][j] = utils.euclidian_dist(centroids[i], centroids[j])

    # Visit every unordered pair of points once. The earlier bounds
    # (i over the first half of the rows only) never compared two points
    # that both lie in the second half.
    for i in range(rows):
        for j in range(i + 1, rows):
            if labels[i] != labels[j]:
                dist = centers[labels[i]][labels[j]]
                minimum_dif_c = min(dist, minimum_dif_c)
            else:
                dist = utils.euclidian_dist(X[i], X[j])
                maximum_same_c = max(dist, maximum_same_c)
    return -minimum_dif_c / maximum_same_c
Esempio n. 29
0
 def update(self, X, n_clusters, labels, k, l, id):
     """Refresh the cached state after a point changed cluster and rescore.

     Recomputes cluster sizes and centroids, re-evaluates ``self.s`` for
     cluster ``k`` and/or ``l`` when that centroid moved by more than
     ``10 ** (-log10(len(X)) - 1)`` times ``self.diameter``, then rewrites
     the entries of rows ``k`` and ``l`` of ``self.max_s_sum`` and
     ``self.min_centroids_dist`` whose column index exceeds the row index.

     Args:
         X: indexable collection of points.
         n_clusters: number of clusters.
         labels: cluster label of every point.
         k, l: the two cluster indices passed to ``update_centroids``.
         id: index into ``X`` of the point passed to ``update_centroids``.

     Returns:
         The mean over clusters ``i`` of
         ``max(self.max_s_sum[i]) / min(self.min_centroids_dist[i])``.
     """
     moved_point = X[id]
     old_centroids = np.copy(self.centroids)
     tolerance = 10**(-math.log(len(X), 10) - 1)
     self.cluster_sizes = cluster_centroid.count_cluster_sizes(labels, n_clusters)
     self.centroids = cluster_centroid.update_centroids(
         self.centroids, self.cluster_sizes, moved_point, k, l)

     # Only recompute a cluster's scatter when its centroid moved noticeably.
     for cluster in (k, l):
         shift = utils.euclidian_dist(old_centroids[cluster], self.centroids[cluster])
         if shift > tolerance * self.diameter:
             self.s_clusters[cluster] = self.s(
                 X, cluster, self.cluster_sizes, labels, self.centroids)

     for other in range(n_clusters):
         for cluster in (k, l):
             if other <= cluster:
                 continue
             self.max_s_sum[cluster][other] = (
                 self.s_clusters[other] + self.s_clusters[cluster])
             self.min_centroids_dist[cluster][other] = utils.euclidian_dist(
                 self.centroids[other], self.centroids[cluster])

     ratios = (
         np.max(self.max_s_sum[row]) / np.min(self.min_centroids_dist[row])
         for row in range(n_clusters)
     )
     return sum(ratios, 0.0) / n_clusters
Esempio n. 30
0
def dunn(X, labels):
    """Return the negated ratio of cluster separation to cluster diameter.

    The separation is the smallest distance between two points with
    different labels; the diameter is the largest distance between two
    points sharing a label.

    Args:
        X: array of points with a ``shape`` attribute; ``X[i]`` carries
            ``labels[i]``.
        labels: cluster label of every point.

    Returns:
        ``-separation / diameter``. The diameter starts at
        ``sys.float_info.min`` (the smallest positive normalized float), so
        it is never zero; the separation stays at ``sys.float_info.max``
        when all points share one label.
    """
    rows = X.shape[0]
    minimum_dif_c = sys.float_info.max  # min dist in different clusters
    maximum_same_c = sys.float_info.min  # max dist in the same cluster
    # Visit every unordered pair of points once. The earlier bounds
    # (i over the first half of the rows only) never compared two points
    # that both lie in the second half.
    for i in range(rows):
        for j in range(i + 1, rows):
            dist = utils.euclidian_dist(X[i], X[j])
            if labels[i] != labels[j]:
                minimum_dif_c = min(dist, minimum_dif_c)
            else:
                maximum_same_c = max(dist, maximum_same_c)
    return -minimum_dif_c / maximum_same_c