def compute(self, kmpp=True):
    if kmpp:
        self.centers = KMeanspp(self.p, self.k, self.w.ravel(), n_init=self.ni).compute()
    else:
        self.centers = self._rand_seeds()
    # np.reshape returns a new array, so assign it back to fix the shape
    self.centers = np.reshape(self.centers, (self.k, self.p.shape[1]))
    dist, d = utils.get_centers_d(self.p, self.centers)
    xcost = np.sum(utils.get_dist_to_centers(self.p, self.centers, d) * self.w)
    points = self.p
    weights = self.w.T
    for j in range(self.e):  # Lloyd iterations
        for i in range(self.k):
            mask = (dist == i)  # points currently assigned to center i
            a = points[mask]
            w = weights[mask]
            if np.sum(w) == 0:  # empty cluster, keep the previous center
                print "empty cluster, keeping previous center"
                continue
            c = a * w
            new_center = np.sum(c, axis=0, keepdims=True)
            new_center /= np.sum(w)
            self.centers[i] = new_center
        dist, d = utils.get_centers_d(self.p, self.centers)
        cost = np.sum(utils.get_dist_to_centers(self.p, self.centers, d) * self.w)
        if abs(xcost - cost) < self.epsilon:
            break
        xcost = cost
    return self.centers
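# utils.get_centers_d and utils.get_dist_to_centers are used above but not
# defined in this listing. A minimal sketch of what they might compute, under
# the assumption that get_centers_d returns (nearest-center index per point,
# full squared-distance matrix) and get_dist_to_centers returns each point's
# squared distance to its nearest center; an inferred sketch, not the
# project's actual implementation.
import numpy as np


def get_centers_d_sketch(points, centers):
    d = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    return np.argmin(d, axis=1), d


def get_dist_to_centers_sketch(points, centers):
    _, d = get_centers_d_sketch(points, centers)
    return np.min(d, axis=1)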
def compute(self):
    best_cent = self.seed()
    if self.n_init == 1:
        return best_cent
    best_cost = np.sum(utils.get_dist_to_centers(self.p, best_cent) * self.w)
    for i in range(self.n_init - 1):
        cent = self.seed()
        cost = np.sum(utils.get_dist_to_centers(self.p, cent) * self.w)
        if cost < best_cost:
            best_cost = cost
            best_cent = cent
    return best_cent
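# The seed() call above is the seeding step, which is not shown in this
# listing. A hedged sketch of a weighted D^2 (k-means++ style) seeding it
# could correspond to (an assumption, not the actual implementation): the
# first center is drawn proportionally to the weights, and every further
# center proportionally to weight times squared distance to the nearest
# center chosen so far.
import numpy as np


def d2_seed_sketch(p, w, k):
    n = p.shape[0]
    centers = np.empty((k, p.shape[1]))
    prob = w / np.sum(w)
    centers[0] = p[np.random.choice(n, p=prob)]
    d2 = np.sum((p - centers[0]) ** 2, axis=1)
    for i in range(1, k):
        prob = w * d2
        prob /= np.sum(prob)
        centers[i] = p[np.random.choice(n, p=prob)]
        d2 = np.minimum(d2, np.sum((p - centers[i]) ** 2, axis=1))
    return centers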
def drop_half_points(self, points, weights, M):
    d = utils.get_dist_to_centers(points, M)
    median = np.median(d)
    points = points[d > median]
    if weights is not None:
        weights = weights[d > median]
    return points, weights
def compute(self, size):
    """
    self.points is a matrix with n rows and d cols.
    bi is a matrix with k*log(n) rows and d cols.
    dist[i] represents sens(p_i) as in the formula discussed.
    """
    e = w_kmeans.Kmeans(self.points, np.expand_dims(self.weights, axis=0), self.k, 10)
    bi = e.compute()
    dist = utils.get_dist_to_centers(self.points, bi)  # distance of each point to its nearest cluster
    if self.weights is not None:  # it is always not None
        dist /= np.sum(dist)  # normalize
        dist *= 2
        c = utils.get_centers(self.points, bi)  # index of the nearest center for each point
        c = self.find_cluester_size_weighted(c, W=self.weights)  # weighted size of each point's cluster
        dist += 4.0 / c  # add the cluster-size term from the formula
        t = np.sum(dist * self.weights)
        weights = t / (dist * size)  # new weights for the coreset points
        dist *= self.weights
        dist /= np.sum(dist)
        prob = dist  # the sampling probability
        points, weights = utils.sample(self.points, prob, size, weights=weights)
        return points, weights
def compute(self, size, grnds=10, ginit=1):
    # Weighted k-means solution used as the approximation for the coreset.
    q = w_KMeans.KMeans(self.p, np.expand_dims(self.w, axis=0), self.k, grnds, ginit).compute()
    sq_d = utils.get_sq_distances(self.p, q)  # squared distances from each point to each center
    dist = utils.get_dist_to_centers(d=sq_d)  # squared distance from each point to its nearest center
    dist /= np.sum(dist)  # normalize
    dist *= 2  # according to the paper
    c = utils.get_centers(d=sq_d)  # index of the nearest center for each point
    c = self._find_cluster_size(c)  # size of the cluster each point belongs to
    s = dist + 4.0 / c  # sensitivity bound; the 4 is taken from the paper
    t = np.sum(s * self.w)  # the t from the paper
    u = t / (s * size)  # new weights for the coreset points
    prob = s * self.w / t  # the probability for sampling
    p, w = utils.sample(self.p, size, prob=prob, weights=u)  # sample the coreset: points + weights
    return p, w
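# utils.sample is used above but not defined in this listing. A minimal
# sketch of what it might do (an assumption, not the project's actual
# implementation): draw `size` point indices according to `prob` and return
# the sampled points together with their coreset weights.
import numpy as np


def sample_sketch(points, size, prob, weights):
    idx = np.random.choice(points.shape[0], size=size, p=prob)
    return points[idx], weights[idx]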
def drop_half_weighted_points(self, points, weights, M, W):
    left = W
    points_to_drop = []
    d = utils.get_dist_to_centers(points, M)
    idx = np.argsort(d)
    i = 0
    while left > 0:
        index = idx[i]
        if weights[index] > left:
            weights[index] -= left
            left = 0
        else:
            left -= weights[index]
            points_to_drop.append(index)
        i += 1
    points = np.delete(points, points_to_drop, axis=0)
    weights = np.delete(weights, points_to_drop)
    return points, weights
def _compute_cost(self, p, means):
    return np.sum(utils.get_dist_to_centers(p, means))
def mapForCost(arr):
    return np.sum(utils.get_dist_to_centers(arr, means1))
t = 50
delta = 100
print "sweeping sample size in [50, 2000] w/ jumps of", delta, "each w/", t, "trials..."
x = []
y = []
y_uni = []
for size in range(50, 2000, delta):
    c_mistake = 0
    u_mistake = 0
    x.append(size)
    print "size:", size, "trials",
    for i in range(0, t):
        # uniform sampling baseline
        s = np.random.choice(range(0, 10030), size)
        s = p[s]
        centers = model.fit(s).cluster_centers_
        uni_cost = np.sum(utils.get_dist_to_centers(p, centers))
        u_mistake += (1 - cost / uni_cost)
        # coreset sample
        p_cset, w_cset = Coreset(p, 2, w).compute(size)
        e = w_KMeans.KMeans(p_cset, np.expand_dims(w_cset, axis=0), 2, 10)
        e = e.compute()
        res = np.sum(utils.get_dist_to_centers(p, e))
        c_mistake += (1 - cost / res)
        sys.stdout.write(".")
        sys.stdout.flush()
    c_mistake /= t
    y.append(c_mistake)
    u_mistake /= t
    y_uni.append(u_mistake)
    print "mistakes for uniform:", round(u_mistake, 3), "coreset:", round(c_mistake, 3)
    u_mistake = c_mistake = 0
plt.plot(x, y, 'r')
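# `model`, `cost`, `p` and `w` are defined earlier in the script and are not
# part of this excerpt. A plausible setup (an assumption, not the original
# code): an sklearn KMeans model and the cost of centers fitted on the full
# data set, used as the baseline in the mistake ratios above.
#
#   from sklearn.cluster import KMeans as SkKMeans
#   model = SkKMeans(n_clusters=2)
#   cost = np.sum(utils.get_dist_to_centers(p, model.fit(p).cluster_centers_))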
import numpy as np
from pyspark import SparkContext
# KMeans and utils come from the project's own modules (imports omitted in this listing);
# a sketch of parseVector follows this script.


def distanceToClosest(p, centers):
    closest = float("+inf")
    for i in range(len(centers)):
        tempDist = np.sum((p - centers[i]) ** 2)
        if tempDist < closest:
            closest = tempDist
    return closest


if __name__ == "__main__":
    points = np.loadtxt("coreset_points.txt", dtype=np.float64)
    weights = np.loadtxt("coreset_weights.txt", dtype=np.float64)
    org = np.loadtxt("small_dataset.txt", dtype=np.float64)
    k = 2
    means = KMeans(points, np.expand_dims(weights, axis=0), k, rounds=20)
    means = means.compute()
    real_cost = np.sum(utils.get_dist_to_centers(
        org, KMeans(org, np.expand_dims(np.ones(org.shape[0]), axis=0), k, rounds=20).compute()))
    print real_cost

    sc = SparkContext(appName="test_results")  # Spark evaluation starts from here.
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "123")
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "456")
    points = sc.textFile("small_dataset.txt").map(parseVector)
    closest = points.map(lambda p: distanceToClosest(p, means))
    cs_result = closest.reduce(lambda a, b: a + b)
    print cs_result
    print "mistake:", (1 - real_cost / cs_result)
    sc.stop()
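# parseVector is referenced in the Spark pipeline above but not defined in
# this listing. A minimal sketch following the usual Spark text-file parsing
# pattern (an assumption, not necessarily the project's implementation):
import numpy as np


def parseVector(line):
    return np.array([float(x) for x in line.split()])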