# Imports assumed by the snippets below; the *_mapper helpers and get_ex()
# referenced here are defined elsewhere in the package.
import math
import time

import numpy as np

from spartan import expr, util
from spartan.array import extent
from spartan.config import FLAGS


def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data-point matrix.
    k(int): the number of clusters.
    num_iter(int): the maximum number of iterations to run.
    m(float): the fuzziness exponent (typically m > 1).
    centers(Expr or DistArray): the initial centers of each cluster.
  '''
  points = points.evaluate()
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  for i in range(num_iter):
    centers = centers.glom()
    # Membership weights of every point in every cluster, shape (n_points, k).
    fuzzy = expr.map2(points, 0, fn=kmeans_map2_dist_mapper,
                      fn_kw={"centers": centers, "m": m},
                      shape=(points.shape[0], centers.shape[0]))
    labels = expr.argmax(fuzzy, axis=1)
    # Membership-weighted sums of the points, reduced across tiles.
    new_centers = expr.map2((points, fuzzy), (0, 0),
                            fn=kmeans_map2_center_mapper,
                            fn_kw={"centers": centers, "m": m},
                            shape=(centers.shape[0], centers.shape[1]),
                            reducer=np.add)
    # Normalize by the total membership weight of each cluster.
    new_centers /= expr.sum(fuzzy ** m, axis=0)[:, expr.newaxis]
    centers = new_centers
  return labels
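# For reference, a minimal single-machine sketch of the same update in plain
# NumPy. This is an illustration, not library code: the name
# fuzzy_kmeans_numpy is ours, and it assumes the mappers above compute the
# standard fuzzy c-means memberships from squared Euclidean distances.
def fuzzy_kmeans_numpy(points, k=10, num_iter=10, m=2.0, seed=0):
  # Standard fuzzy c-means: the membership of point i in cluster j is
  # proportional to dist(i, j) ** (-2 / (m - 1)), normalized over clusters.
  rng = np.random.RandomState(seed)
  centers = rng.rand(k, points.shape[1])
  labels = None
  for _ in range(num_iter):
    # Pairwise squared distances, shape (n_points, k).
    dist = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    dist = np.maximum(dist, 1e-12)   # guard against a zero distance
    inv = dist ** (-1.0 / (m - 1))   # == (1 / d**2) ** (1 / (m - 1))
    fuzzy = inv / inv.sum(axis=1, keepdims=True)
    labels = np.argmax(fuzzy, axis=1)
    # Membership-weighted mean of the points, as in the map2 version above.
    w = fuzzy ** m
    centers = np.dot(w.T, points) / w.sum(axis=0)[:, None]
  return labels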
def cholesky(A):
  '''
  Cholesky matrix decomposition.

  Args:
    A(Expr): matrix to be decomposed.
  '''
  A = expr.force(A)
  n = int(math.sqrt(len(A.tiles)))
  tile_size = A.shape[0] // n
  for k in range(n):
    # A[k,k] = DPOTRF(A[k,k])
    diag_ex = get_ex(k, k, tile_size, A.shape)
    A = expr.map2(A, ((0, 1), ), fn=_cholesky_dpotrf_mapper,
                  shape=A.shape, update_region=diag_ex)

    if k == n - 1:
      break

    # A[l,k] = DTRSM(A[k,k], A[l,k])  for l in [k+1, n)
    col_ex = extent.create(((k + 1) * tile_size, k * tile_size),
                           (n * tile_size, (k + 1) * tile_size), A.shape)
    diag_tile = A.force().fetch(diag_ex)
    A = expr.map2(A, ((0, 1), ), fn=_cholesky_dtrsm_mapper,
                  fn_kw=dict(array=expr.force(A), diag_tile=diag_tile),
                  shape=A.shape, update_region=col_ex)

    # A[m,m] = DSYRK(A[m,k], A[m,m])          for m in [k+1, n)
    # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m])  for m in [k+1, n), l in [m+1, n)
    col_exs = [extent.create((m * tile_size, m * tile_size),
                             (n * tile_size, (m + 1) * tile_size), A.shape)
               for m in range(k + 1, n)]
    dgemm_1 = expr.transpose(A)[(k * tile_size):((k + 1) * tile_size), :]
    dgemm_2 = A[:, (k * tile_size):((k + 1) * tile_size)]

    A = expr.map2((A, dgemm_1, dgemm_2), ((0, 1), 1, 0),
                  fn=_cholesky_dsyrk_dgemm_mapper,
                  fn_kw=dict(array=expr.force(A), k=k),
                  shape=A.shape, update_region=col_exs)

  # Zero out the tiles above the diagonal.
  col_exs = [extent.create((0, m * tile_size),
                           (m * tile_size, (m + 1) * tile_size), A.shape)
             for m in range(1, n)]
  A = expr.map2(A, ((0, 1), ), fn=_zero_mapper,
                shape=A.shape, update_region=col_exs)
  return A
# Variant of cholesky() that sizes the tile grid from the worker count and
# passes the diagonal tile to the DTRSM step via slicing instead of an
# explicit fetch.
def cholesky(A):
  '''
  Cholesky matrix decomposition.

  Args:
    A(Expr): matrix to be decomposed.
  '''
  n = int(math.sqrt(FLAGS.num_workers))
  tile_size = A.shape[0] // n
  print(n, tile_size)
  for k in range(n):
    # A[k,k] = DPOTRF(A[k,k])
    diag_ex = get_ex(k, k, tile_size, A.shape)
    A = expr.map2(A, ((0, 1), ), fn=_cholesky_dpotrf_mapper,
                  shape=A.shape, update_region=diag_ex)

    if k == n - 1:
      break

    # A[l,k] = DTRSM(A[k,k], A[l,k])  for l in [k+1, n)
    col_ex = extent.create(((k + 1) * tile_size, k * tile_size),
                           (n * tile_size, (k + 1) * tile_size), A.shape)
    A = expr.map2((A, A[diag_ex.to_slice()]), ((0, 1), None),
                  fn=_cholesky_dtrsm_mapper,
                  shape=A.shape, update_region=col_ex)

    # A[m,m] = DSYRK(A[m,k], A[m,m])          for m in [k+1, n)
    # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m])  for m in [k+1, n), l in [m+1, n)
    col_exs = [extent.create((m * tile_size, m * tile_size),
                             (n * tile_size, (m + 1) * tile_size), A.shape)
               for m in range(k + 1, n)]
    dgemm = A[:, (k * tile_size):((k + 1) * tile_size)]
    A = expr.map2((A, expr.transpose(dgemm), dgemm), ((0, 1), 1, 0),
                  fn=_cholesky_dsyrk_dgemm_mapper,
                  shape=A.shape, update_region=col_exs).optimized()

  # Zero out the tiles above the diagonal.
  col_exs = [extent.create((0, m * tile_size),
                           (m * tile_size, (m + 1) * tile_size), A.shape)
             for m in range(1, n)]
  A = expr.map2(A, ((0, 1), ), fn=_zero_mapper,
                shape=A.shape, update_region=col_exs)
  return A
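# To make the tile algorithm above concrete, here is a single-machine sketch
# of the same right-looking blocked factorization in plain NumPy. The helper
# name blocked_cholesky is ours; it mirrors the DPOTRF / DTRSM / DSYRK+DGEMM
# stages and assumes the matrix dimension is a multiple of the tile size.
def blocked_cholesky(A, tile_size):
  L = np.array(A, dtype=float)
  n = L.shape[0] // tile_size
  for k in range(n):
    ks = slice(k * tile_size, (k + 1) * tile_size)
    # DPOTRF: factor the diagonal tile.
    L[ks, ks] = np.linalg.cholesky(L[ks, ks])
    if k == n - 1:
      break
    rest = slice((k + 1) * tile_size, n * tile_size)
    # DTRSM: L[rest, k] = A[rest, k] * inv(L[k, k]).T
    L[rest, ks] = np.linalg.solve(L[ks, ks], L[rest, ks].T).T
    # DSYRK / DGEMM: trailing update of the remaining submatrix.
    L[rest, rest] -= np.dot(L[rest, ks], L[rest, ks].T)
  return np.tril(L)  # zero the tiles above the diagonal, as _zero_mapper does

# Sanity check against NumPy's reference factorization.
M = np.random.randn(8, 8)
SPD = np.dot(M, M.T) + 8 * np.eye(8)  # symmetric positive definite
assert np.allclose(blocked_cholesky(SPD, 2), np.linalg.cholesky(SPD))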
def fit(self, X, centers=None, implementation='outer'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features).
      It should be tiled by rows.
  centers : numpy.ndarray.
      The initial centers. If None, they are randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Assign each point to its nearest center.
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                         fn_kw={"centers": centers},
                         shape=(X.shape[0], ))
      # Per-cluster point counts and coordinate sums.
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # One or more centroids may have no points assigned to them, which
      # would leave their position as the zero vector. Reseed those
      # centroids with new random values.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        counts[zcount_indices] = 1  # avoid division by zero
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None),
                          fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0],))
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # Reseed empty clusters, as in the 'map2' branch.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        counts[zcount_indices] = 1  # avoid division by zero
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      # Pairwise squared distances via broadcasting:
      # (n_samples, 1, n_dim) - (1, n_clusters, n_dim).
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers,
                                       (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast),
                           axis=2)
      labels = expr.argmin(distances, axis=1)

      # One-hot membership matrix of shape (n_samples, n_clusters).
      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast *
                         expr.reshape(matches, (matches.shape[0],
                                                matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # Reseed empty clusters, as in the 'map2' branch.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        counts[zcount_indices] = 1  # avoid division by zero
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Fresh accumulators each iteration; concurrent tile updates are
      # combined by addition.
      new_centers = expr.ndarray((self.n_clusters, num_dim),
                                 reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                reduce_fn=lambda a, b: a + b)
      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1,),
                       cost_hint={hash(labels): {'00': 0,
                                                 '01': np.prod(labels.shape)}})
      _.force()
      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # Reseed empty clusters, as in the 'map2' branch.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        new_counts[zcount_indices] = 1  # avoid division by zero
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers
    return centers, labels
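# The 'broadcast' branch translates most directly into plain NumPy. Below is
# a minimal sketch of one iteration; the name kmeans_iteration is ours, and
# it folds the empty-cluster guard into a simple clamp instead of reseeding.
def kmeans_iteration(X, centers):
  # Squared Euclidean distances via broadcasting, shape (n_samples, n_clusters).
  distances = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
  labels = np.argmin(distances, axis=1)
  # One-hot membership matrix, mirroring the `matches` expression above.
  matches = (labels[:, None] == np.arange(centers.shape[0])[None, :])
  matches = matches.astype(np.int64)
  counts = np.maximum(matches.sum(axis=0), 1)  # avoid division by zero
  sums = np.dot(matches.T, X)                  # per-cluster coordinate sums
  return sums / counts[:, None], labels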
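# A hedged usage sketch for fit(): the surrounding class is not shown in this
# snippet, so the KMeans name and its constructor arguments are assumptions
# inferred from self.n_clusters and self.n_iter.
X = expr.rand(100000, 16)                  # row-tiled input, as fit() expects
model = KMeans(n_clusters=10, n_iter=10)   # hypothetical class/constructor
centers, labels = model.fit(X, implementation='broadcast')
labels = labels.glom()                     # materialize assignments locally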