Example #1
0
    def kmeans_plus_plus(self, num_clusters=100, exponent=2):
        '''
        Select k initial clusters by kmeans++ scheme, see https://en.wikipedia.org/wiki/K-means%2B%2B

        The first center is a uniformly random row of ``self.data``.  Every
        following center is drawn from the rows that have not been chosen yet,
        with probability proportional to ``d ** exponent``, where ``d`` is the
        Euclidean distance from the row to its *nearest* already-chosen center.
        ``self.data`` itself is not modified.

        :param num_clusters: number of centers to pick; must lie between 1 and
            the number of rows in ``self.data``.
        :param exponent: power applied to the nearest-center distance to form
            the sampling weight (2 is the standard kmeans++ choice).
        :return: float array of shape ``(num_clusters, p)`` whose rows are the
            chosen centers, in the order they were picked.
        :raises ValueError: if ``num_clusters`` is smaller than 1 or larger
            than the number of rows.
        '''
        X = np.asarray(self.data)
        n, p = X.shape
        if not 1 <= num_clusters <= n:
            raise ValueError(
                "num_clusters must be between 1 and the number of samples "
                "(%d), got %r" % (n, num_clusters))

        clusters = np.zeros([num_clusters, p])
        first_row = np.random.randint(0, n)
        clusters[0] = X[first_row]
        # np.delete returns a copy, so the caller's data is left untouched.
        X = np.delete(X, first_row, 0)

        print("Starting kmeans++ initialization...")
        # nearest[j] is the distance from remaining row j to the closest
        # center chosen so far.  It is updated incrementally, so each round
        # only has to measure distances to the newest center.
        nearest = np.sqrt(((X - clusters[0]) ** 2).sum(axis=1))
        for i in range(1, num_clusters):
            weights = nearest ** exponent

            # Weighted reservoir sampling: the row maximising u ** (1 / w) is
            # a draw with probability proportional to w.  Ranks are compared
            # in log space (log(u) / w), which orders them identically but
            # does not underflow for small weights.  Rows with zero weight
            # (duplicates of a center) get -inf so they are only picked when
            # nothing else is left.
            ranks = np.full(len(X), -np.inf)
            positive = weights > 0
            # 1 - rand() lies in (0, 1], so the logarithm is always finite.
            uniform = 1.0 - np.random.rand(int(positive.sum()))
            ranks[positive] = np.log(uniform) / weights[positive]

            # argmax returns the first maximum, i.e. row 0 when every
            # remaining row has zero weight.
            index = int(np.argmax(ranks))
            clusters[i] = X[index]
            X = np.delete(X, index, 0)
            nearest = np.delete(nearest, index)
            to_new_center = np.sqrt(((X - clusters[i]) ** 2).sum(axis=1))
            nearest = np.minimum(nearest, to_new_center)
        return clusters
Example #2
0
 def get_closest_cluster_index(self, sample, clusters):
     '''
     Return the position in ``clusters`` of the entry closest to ``sample``.

     Closeness is measured with ``stats_utils.euclidean_distance``.  When
     several entries are equally close, the first one wins.

     :param sample: the point to assign.
     :param clusters: iterable of cluster centers.
     :return: index of the closest center, or ``None`` if ``clusters`` is
         empty or no distance compares smaller than infinity.
     '''
     # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
     smallest_distance = np.inf
     index = None
     for k, cluster in enumerate(clusters):
         distance = stats_utils.euclidean_distance(cluster, sample)
         # Strict comparison keeps the earliest center on ties.
         if distance < smallest_distance:
             smallest_distance = distance
             index = k
     return index