def seed(self):
    """Pick ``self.k`` initial centers from ``self.p`` by iterative weighted sampling.

    The first center is drawn with probability proportional to ``self.w``.
    Every further center is drawn with probability proportional to
    ``self.w`` times the running minimum of the values returned by
    ``utils.get_sq_distances`` against the centers chosen so far
    (presumably squared distances -- confirm against ``utils``).

    :return: the chosen centers as a ``np.float64`` array, one row per center
    """
    # First center: sample straight from the normalized point weights.
    first = utils.sample(self.p, 1, self.w / np.sum(self.w))
    chosen = [first[0]]

    closest = None  # running per-point minimum over all chosen centers
    remaining = self.k - 1
    while remaining > 0:
        # Only the newest center can lower the running minimum, so it is
        # the only one measured on each pass (kept 2-D via the slice).
        newest = np.array(chosen)[-1:]
        to_newest = utils.get_sq_distances(x=self.p, y=newest).ravel()
        closest = to_newest if closest is None else np.minimum(closest, to_newest)

        # Work on a copy so the running minimum is not scaled by the weights.
        scaled = np.array(closest)
        scaled *= self.w
        picked = utils.sample(self.p, 1, scaled / np.sum(scaled))
        chosen.append(picked[0])
        remaining -= 1

    return np.array(chosen, dtype=np.float64)
def points_cost(self, points, centers):
    """
    Return the smallest value ``utils.get_sq_distances`` reports between any
    point and any center.

    NOTE(review): ``np.amin`` is applied without an axis, so the result is a
    single scalar over all points and centers, not one value per point.
    Confirm this is what callers expect.

    :param points: a list of points with dimension d
    :param centers: a list of centers to measure the points against
    :return: the overall minimum as a scalar; ``float("inf")`` when
             ``centers`` is empty
    """
    smallest = float("inf")
    for candidate in centers:
        # The distance helper is handed a one-element list holding the center.
        single_center = [np.array(candidate)]
        distances = utils.get_sq_distances(x=points, y=single_center)
        smallest = np.minimum(smallest, np.amin(distances))
    return smallest
def compute(self, size, grnds=10, ginit=1):
    """Sample a weighted coreset of ``size`` points from ``self.p``.

    A weighted k-means solution is computed first. Each point then gets a
    sampling score built from its normalized distance to its center plus a
    term inversely proportional to its cluster size. Points are sampled in
    proportion to score times weight and re-weighted accordingly.

    :param size: number of points to sample into the coreset
    :param grnds: forwarded to ``w_KMeans.KMeans`` (fourth positional argument)
    :param ginit: forwarded to ``w_KMeans.KMeans`` (fifth positional argument)
    :return: tuple ``(p, w)`` of sampled points and their coreset weights
    """
    # Rough clustering of the weighted input; its centers drive the scores.
    solver = w_KMeans.KMeans(
        self.p, np.expand_dims(self.w, axis=0), self.k, grnds, ginit
    )
    approx_centers = solver.compute()

    # Squared distances from each point to each center.
    sq_dists = utils.get_sq_distances(self.p, approx_centers)

    # Distance of every point to its own center, normalized to sum to one
    # and doubled (the factor 2 follows the referenced paper).
    share = utils.get_dist_to_centers(d=sq_dists)
    share /= np.sum(share)
    share *= 2

    # Index of each point's center, mapped to the size of that cluster.
    assignment = utils.get_centers(d=sq_dists)
    cluster_size = self._find_cluster_size(assignment)

    # Per-point score; the constant 4 also follows the referenced paper.
    score = share + 4.0 / cluster_size
    weighted_score = score * self.w
    total = np.sum(weighted_score)

    coreset_weights = total / (score * size)  # new weight of each point if sampled
    sampling_prob = weighted_score / total    # probability of sampling each point

    p, w = utils.sample(self.p, size, prob=sampling_prob, weights=coreset_weights)
    return p, w
def sample_independently_bahman(self, points, centers, overSamplingFactor):
    """
    Return a set of center candidates, sampling every point independently.

    Point ``i`` is kept with probability
    ``overSamplingFactor * d_i / sum(d)``, where ``d`` is the flattened
    result of ``utils.get_sq_distances(points, centers)``.

    NOTE(review): ``d`` is the raveled helper output indexed by point
    position. If the helper returns one column per center, this only lines
    up with the points when a single center is passed -- confirm against
    the caller.

    :param points: the points to sample from
    :param centers: the current centers the distances are measured against
    :param overSamplingFactor: how many new centers will we sample
                               (scales every point's keep-probability)
    :return: list of distinct center candidates, in first-seen order
    """
    sq_min_dist_array = utils.get_sq_distances(x=points, y=centers).ravel()
    phy_x_c = sum(sq_min_dist_array)
    # Compare by value: an identity test against the literal 0 is never true
    # for a float sum, which left the guard below dead.
    if phy_x_c == 0:
        # Singular case where every point coincides with a center.
        phy_x_c = 0.0000001

    candidates = []
    seen = set()
    # Visit every point, including the last one.
    for i, point in enumerate(points):
        p_x = overSamplingFactor * sq_min_dist_array[i] / phy_x_c
        if random() <= p_x:
            # Array rows and lists are unhashable, so de-duplicate on a
            # tuple of the coordinates; scalar points are used as-is.
            try:
                key = tuple(point)
            except TypeError:
                key = point
            if key not in seen:
                seen.add(key)
                candidates.append(point)
    return candidates