def _test_optimization_ordered(self):
    """Check an optimized chain of arithmetic, slicing and dot against NumPy.

    The same sequence of operations is applied to the distributed
    expressions and to the NumPy arrays they were built from; the final
    100x100 windows must agree to within 1e-10.
    """
    def pipeline(lhs, rhs, dot):
        # Only operators, slicing and the supplied ``dot`` are used, so the
        # same code runs on expressions and on plain NumPy arrays.
        diff = lhs - rhs
        total = lhs + diff
        inner_a = diff[200:900, 200:900]
        inner_b = total[200:900, 200:900]
        delta = inner_a - inner_b
        mixed = inner_a + delta
        left = delta[100:500, 100:500]
        right = mixed[100:500, 100:500]
        product = dot(left, right)
        added = left + right
        reduced = right - product
        return (reduced - added)[100:200, 100:200]

    na = np.random.rand(1000, 1000)
    nb = np.random.rand(1000, 1000)

    actual = pipeline(expr.from_numpy(na), expr.from_numpy(nb), expr.dot)
    expected = pipeline(na, nb, np.dot)

    Assert.all_eq(expected, actual.optimized().glom(), tolerance=1e-10)
def test_assign_1d(self):
    """Check ``assign`` on 1-d arrays against NumPy slice assignment.

    For each (destination region, source region) pair a fresh random
    destination is created, the source slice of ``b`` is assigned into it
    through ``assign``, and the result is compared with the same assignment
    done directly in NumPy.
    """
    b = np.random.randn(100)
    sp_b = from_numpy(b)

    cases = [
        # a[:] = b[:] copy entire array
        (np.s_[0:100], np.s_[0:100]),
        # a[0] = b[1] copy one value
        (np.s_[0], np.s_[1]),
        # a[0:10] = b[20:30] copy range of values
        (np.s_[0:10], np.s_[20:30]),
        # a[30:60] = b[:30] copy range of values, destination not starting
        # from 0.  This case used to repeat the regions of the previous one,
        # so a non-zero destination offset was never exercised.
        (np.s_[30:60], np.s_[:30]),
    ]

    for region_a, region_b in cases:
        a = np.random.randn(100)
        sp_a = assign(from_numpy(a), region_a, sp_b[region_b]).glom()
        a[region_a] = b[region_b]
        Assert.all_eq(sp_a, a)
def _test_optimization_ordered(self):
    """Compare an optimized expression chain with its NumPy counterpart.

    Two random 1000x1000 matrices are pushed through an identical series of
    additions, subtractions, window slices and one dot product, once as
    expressions and once as NumPy arrays.  The resulting 100x100 blocks are
    compared with a tolerance of 1e-10.
    """
    outer = (slice(200, 900), slice(200, 900))
    middle = (slice(100, 500), slice(100, 500))
    final = (slice(100, 200), slice(100, 200))

    def evaluate_chain(x, y, dot):
        # Indexing with a tuple of slices is the same as ``v[a:b, c:d]``.
        x_minus_y = x - y
        x_plus_diff = x + x_minus_y
        f = x_minus_y[outer]
        g = x_plus_diff[outer]
        h = f - g
        i = f + h
        j = h[middle]
        k = i[middle]
        product = dot(j, k)
        pair_sum = j + k
        return ((k - product) - pair_sum)[final]

    na = np.random.rand(1000, 1000)
    nb = np.random.rand(1000, 1000)
    a = expr.from_numpy(na)
    b = expr.from_numpy(nb)

    q = evaluate_chain(a, b, expr.dot)
    nq = evaluate_chain(na, nb, np.dot)

    Assert.all_eq(nq, q.optimized().glom(), tolerance=1e-10)
def fit(self, X, y):
    """
    Train the ensemble by running ``_build_mapper`` over a task array.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The training input samples.
    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (integers that correspond to classes in
        classification, real numbers in regression).

    Returns
    -------
    self : object
        Returns self.
    """
    # Wrap raw NumPy inputs first so that both arguments can be forced.
    if isinstance(X, np.ndarray):
        X = expr.from_numpy(X)
    if isinstance(y, np.ndarray):
        y = expr.from_numpy(y)
    X = expr.force(X)
    y = expr.force(y)

    self.n_classes = np.unique(y.glom()).size

    num_workers = blob_ctx.get().num_workers
    tasks = self._create_task_array(num_workers, self.n_estimators)
    task_array = expr.from_numpy(tasks, tile_hint=(1, )).force()
    target_array = expr.ndarray((task_array.shape[0], ),
                                dtype=object,
                                tile_hint=(1, )).force()

    mapper_kw = {
        'task_array': task_array,
        'target_array': target_array,
        'X': X,
        'y': y,
        'criterion': self.criterion,
        'max_depth': self.max_depth,
        'min_samples_split': self.min_samples_split,
        'min_samples_leaf': self.min_samples_leaf,
        'max_features': self.max_features,
        'bootstrap': self.bootstrap,
    }
    # The mapper's return value is not used; it is given ``target_array``
    # through the keyword arguments.
    task_array.foreach_tile(mapper_fn=_build_mapper, kw=mapper_kw)

    # Target array stores the local random forest each worker builds,
    # it's used for further prediction.
    self.target_array = target_array
    return self
def test_transpose_dot(self):
    """dot(A, transpose(B)) on distributed arrays must match NumPy."""
    lhs = np.random.random((401, 97))
    rhs = np.random.random((401, 97))
    expected = np.dot(lhs, np.transpose(rhs))

    # Only the A . B^T orientation is checked; A^T . B is not covered here.
    sp_lhs = expr.from_numpy(lhs)
    sp_rhs = expr.from_numpy(rhs)
    actual = expr.dot(sp_lhs, expr.transpose(sp_rhs))

    assert np.allclose(expected, actual.glom())
def fit(self, X, centers=None):
    """Compute k-means clustering.

    Each iteration computes squared distances between every point and every
    center by broadcasting, assigns each point to its closest center, and
    replaces every center by the mean of the points assigned to it.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.

    Returns
    -------
    centers, labels : the centers after ``self.n_iter`` iterations and the
        label expression from the last iteration (an all-zero array if no
        iteration ran).
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]
    # NOTE(review): ``np.int`` is an alias of the builtin ``int`` that newer
    # NumPy releases no longer provide -- confirm the supported NumPy version.
    labels = expr.zeros((num_points, 1), dtype=np.int)
    if centers is None:
        centers = expr.from_numpy(np.random.rand(self.n_clusters, num_dim))
    for i in range(self.n_iter):
        # Reshape to (n, 1, d) and (1, k, d) so the subtraction broadcasts to
        # one difference vector per (point, center) pair.
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
        # Squared Euclidean distance, summed over the feature axis.
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
        labels = expr.argmin(distances, axis=1)
        # One-hot membership: matches[p, c] is 1 when point p has label c.
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        # Per-center sum of member points; divided by ``counts`` below.
        centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0], matches.shape[1], 1)), axis=0)
        # Evaluate both expressions to local NumPy arrays.
        counts = counts.optimized().glom()
        centers = centers.optimized().glom()
        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)
        if np.any(zcount_indices):
            # One or more centroids may not have any points assigned to them,
            # which results in their position being the zero-vector. We reseed these
            # centroids with new random values.
            n_points = np.count_nonzero(zcount_indices)
            # In order to get rid of dividing by zero.
            counts[zcount_indices] = 1
            centers[zcount_indices, :] = np.random.randn(n_points, num_dim)
        # Turn the per-center sums into means.
        centers = centers / counts.reshape(centers.shape[0], 1)
        # Wrap again so the next iteration can reshape it as an expression.
        centers = expr.from_numpy(centers)
    return centers, labels
    # NOTE(review): the triple quote below has no visible opener in this part
    # of the file; it presumably closes a block string that starts earlier --
    # confirm before removing it.
'''
def fit(self, X, y):
    """
    Train the ensemble by running ``_build_mapper`` over a task array.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The training input samples.
    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (integers that correspond to classes in
        classification, real numbers in regression).

    Returns
    -------
    self : object
        Returns self.
    """
    # Convert NumPy inputs to expressions, then evaluate both.
    if isinstance(X, np.ndarray):
        X = expr.from_numpy(X)
    if isinstance(y, np.ndarray):
        y = expr.from_numpy(y)
    X = X.evaluate()
    y = y.evaluate()

    self.n_classes = np.unique(y.glom()).size

    worker_count = blob_ctx.get().num_workers
    assignments = self._create_task_array(worker_count, self.n_estimators)
    task_array = expr.from_numpy(assignments, tile_hint=(1, )).evaluate()
    target_array = expr.ndarray((task_array.shape[0], ),
                                dtype=object,
                                tile_hint=(1,)).evaluate()

    # The mapper's return value is not used; it is given ``target_array``
    # through the keyword arguments.
    task_array.foreach_tile(
        mapper_fn=_build_mapper,
        kw=dict(task_array=task_array,
                target_array=target_array,
                X=X,
                y=y,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=self.max_features,
                bootstrap=self.bootstrap))

    # Target array stores the local random forest each worker builds,
    # it's used for further prediction.
    self.target_array = target_array
    return self
def test_assign_array_like(self):
    """Assign a plain NumPy vector into one row and compare with NumPy."""
    target = np.zeros((20, 10))
    row = np.ones((10, ))
    where = np.s_[10, ]

    distributed = assign(from_numpy(target), where, row)
    sp_result = distributed.glom()

    # Apply the same assignment locally to obtain the expected array.
    target[where] = row
    Assert.all_eq(sp_result, target)
def test_newaxis(self):
    """Indexing with ``expr.newaxis`` must give the same shapes as NumPy."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    rows = slice(2, 7)
    cols = slice(4, 8)
    AXIS = object()  # placeholder replaced by the library-specific newaxis

    def build(pattern, newaxis):
        # ``v[tuple]`` is the same as writing the items inline in ``v[...]``.
        return tuple(newaxis if part is AXIS else part for part in pattern)

    patterns = [
        (AXIS, rows, cols),
        (AXIS, rows, AXIS, cols),
        (AXIS, rows, AXIS, cols, AXIS),
        # Extreme case
        (AXIS, AXIS, AXIS, AXIS, rows, AXIS, AXIS, AXIS, cols,
         AXIS, AXIS, AXIS),
    ]
    for pattern in patterns:
        Assert.all_eq(na[build(pattern, np.newaxis)].shape,
                      a[build(pattern, expr.newaxis)].shape)

    logged = (AXIS, rows, AXIS, cols, AXIS, AXIS, AXIS)
    util.log_info('\na.shape: %s \nna.shape: %s',
                  a[build(logged, expr.newaxis)].shape,
                  na[build(logged, np.newaxis)].shape)
def test_newaxis(self):
    """Shapes produced with ``expr.newaxis`` must agree with ``np.newaxis``."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    r = slice(2, 7)
    c = slice(4, 8)

    def indices(nx):
        # The same index tuples, parameterized by the newaxis object to use.
        return [
            (nx, r, c),
            (nx, r, nx, c),
            (nx, r, nx, c, nx),
            # Extreme case
            (nx, nx, nx, nx, r, nx, nx, nx, c, nx, nx, nx),
        ]

    for np_index, sp_index in zip(indices(np.newaxis), indices(expr.newaxis)):
        Assert.all_eq(na[np_index].shape, a[sp_index].shape)

    sp_logged = a[expr.newaxis, r, expr.newaxis, c,
                  expr.newaxis, expr.newaxis, expr.newaxis]
    np_logged = na[np.newaxis, r, np.newaxis, c,
                   np.newaxis, np.newaxis, np.newaxis]
    util.log_info('\na.shape: %s \nna.shape: %s',
                  sp_logged.shape, np_logged.shape)
def kneighbors(self, X, n_neighbors=None):
    """Finds the K-neighbors of a point.

    Returns the distances to, and the indices of, the nearest fitted points
    for every row of ``X``.

    Parameters
    ----------
    X : array-like, last dimension same as that of fit data
        The new point.
    n_neighbors : int
        Number of neighbors to get (default is ``self.n_neighbors``).  When
        given, it also replaces ``self.n_neighbors``.

    Returns
    -------
    dist : array
        Array representing the lengths to point.
    ind : array
        Indices of the nearest points in the population matrix.
    """
    if n_neighbors is not None:
        self.n_neighbors = n_neighbors
    # Always slice with the effective value: the raw argument may be None,
    # and ``[:, :None]`` would return every column instead of the k nearest.
    k = self.n_neighbors

    if isinstance(X, np.ndarray):
        X = expr.from_numpy(X)

    if self.algorithm in ('auto', 'brute'):
        # Broadcast (q, 1, d) against (1, n, d) to get all pairwise
        # squared distances.
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        fit_X_broadcast = expr.reshape(self.X, (1, self.X.shape[0],
                                                self.X.shape[1]))
        distances = expr.sum((X_broadcast - fit_X_broadcast) ** 2, axis=2)

        neigh_ind = expr.argsort(distances, axis=1)
        neigh_ind = neigh_ind[:, :k].optimized().glom()
        neigh_dist = expr.sort(distances, axis=1)
        neigh_dist = expr.sqrt(neigh_dist[:, :k]).optimized().glom()
        return neigh_dist, neigh_ind

    results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                  kw={'X': self.X,
                                      'Q': X,
                                      'n_neighbors': self.n_neighbors,
                                      'algorithm': self.algorithm})
    dist = None
    ind = None
    # Get the KNN candidates for each tile of X, then find out the real KNN
    for _, v in results.iteritems():
        if dist is None:
            dist = v[0]
            ind = v[1]
        else:
            dist = np.concatenate((dist, v[0]), axis=1)
            ind = np.concatenate((ind, v[1]), axis=1)

    # Keep, per query row, the k smallest distances among all candidates.
    mask = np.argsort(dist, axis=1)[:, :k]
    new_dist = np.array([row[cols] for row, cols in zip(dist, mask)])
    new_ind = np.array([row[cols] for row, cols in zip(ind, mask)])
    return new_dist, new_ind
def fit(self, X):
    """Store the reference points used by later neighbor queries.

    Parameters
    ----------
    X : numpy.ndarray, expr.Expr or other array object
        A NumPy array is wrapped with a tile hint of
        ``(rows // num_workers, columns)``; an expression is forced before
        it is stored.  Anything else is stored unchanged.

    Returns
    -------
    self : object
        Returns self.
    """
    ctx = blob_ctx.get()
    if isinstance(X, np.ndarray):
        # Floor division keeps the tile hint an integer on every Python
        # version; ``/`` yields a float under true division.
        rows_per_worker = X.shape[0] // ctx.num_workers
        X = expr.from_numpy(X, tile_hint=(rows_per_worker, X.shape[1]))
    if isinstance(X, expr.Expr):
        X = X.force()
    self.X = X
    return self
def test_from_np1d(self):
    """Load a dense array from .npy and .npz files and from memory.

    The temporary files are removed even when an assertion fails, and
    without spawning a shell.
    """
    npa = np.random.random((100, 100))
    written = ('_test_write1.npy', '_test_write2.npz')
    try:
        # np.save / np.savez append the .npy / .npz suffix themselves.
        np.save('_test_write1', npa)
        np.savez('_test_write2', npa)

        t1 = expr.from_file('_test_write1.npy', sparse=False)
        t2 = expr.from_file('_test_write2.npz', sparse=False)
        t3 = expr.from_numpy(npa)

        Assert.all_eq(t1.glom(), npa)
        Assert.all_eq(t2.glom(), npa)
        Assert.all_eq(t3.glom(), npa)
    finally:
        for path in written:
            if os.path.exists(path):
                os.remove(path)
def benchmark_ssvd(ctx, timer):
    """Time one ``svd`` call on a random 1280x1280 matrix and print it.

    ``ctx`` and ``timer`` are accepted but not used by this benchmark.
    """
    rows, cols = 1280, 1280
    # The input is generated with NumPy and wrapped, rather than generated
    # through ``expr.randn``.
    matrix = expr.from_numpy(np.random.randn(rows, cols))

    started = datetime.now()
    _u, _s, _vt = svd(matrix)
    finished = datetime.now()

    elapsed = millis(started, finished)
    print("total cost time:%s ms" % (elapsed))
def test_del_dim(self):
    """Integer indices must drop a dimension exactly as they do in NumPy."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    # (index, compare_values): values are compared when the flag is set,
    # otherwise only the resulting shapes.
    checks = [
        ((slice(2, 7), 8), True),
        ((slice(3, 9), 4), False),
        ((slice(2, 7), -1), True),
        ((-1, slice(3, 9)), False),
    ]
    for index, compare_values in checks:
        if compare_values:
            Assert.all_eq(na[index], a[index].glom())
        else:
            Assert.all_eq(na[index].shape, a[index].shape)

    column = (slice(3, 9), 4)
    util.log_info('\na.shape: %s \nna.shape %s',
                  a[column].shape, na[column].shape)
def benchmark_ssvd(ctx, timer):
    """Print how long ``svd`` takes on a random 1280x1280 matrix.

    Neither ``ctx`` nor ``timer`` is used by this benchmark.
    """
    DIM = (1280, 1280)
    # Input comes from NumPy and is wrapped into an expression.
    local = np.random.randn(*DIM)
    distributed = expr.from_numpy(local)

    before = datetime.now()
    factors = svd(distributed)
    after = datetime.now()
    # Unpack only to keep the requirement of exactly three results.
    _u, _s, _vt = factors

    print("total cost time:%s ms" % (millis(before, after)))
def benchmark_pca(ctx, timer):
    """Print how long building and fitting a PCA on 1280x512 data takes.

    ``ctx`` and ``timer`` are accepted but not used.
    """
    n_samples, n_features = 1280, 512
    samples = expr.from_numpy(np.random.randn(n_samples, n_features))

    # Construction of the model is part of the timed section.
    started = datetime.now()
    model = PCA(N_COMPONENTS)
    model.fit(samples)
    finished = datetime.now()

    elapsed = millis(started, finished)
    print("total cost time:%s ms" % (elapsed))
def test_pca(self):
    """Components found by ``PCA`` must match ``SK_PCA`` up to sign."""
    FLAGS.opt_parakeet_gen = 0

    raw = np.random.randn(*DIM)
    tiled = expr.from_numpy(raw,
                            tile_hint=util.calc_tile_hint(DIM, axis=0))

    ours = PCA(N_COMPONENTS)
    reference = SK_PCA(N_COMPONENTS)
    ours.fit(tiled)
    reference.fit(raw)

    print(reference.components_ - ours.components_)

    # Magnitudes are compared, so a component whose sign is flipped in one
    # of the two implementations still counts as equal.
    ours_abs = absolute(ours.components_)
    reference_abs = absolute(reference.components_)
    assert np.allclose(ours_abs, reference_abs)
def test_reshape_dot(self):
    """dot() with one reshaped operand must match NumPy for three layouts."""
    # (shape of first operand, shape of second operand,
    #  shape to reshape to, whether the FIRST operand is the reshaped one)
    cases = [
        ((357, 93), (31, 357), (1071, 31), True),
        ((357, 718), (718, ), (718, 1), False),
        ((718, ), (1, 357), (718, 1), True),
    ]

    for shape1, shape2, new_shape, reshape_first in cases:
        npa1 = np.random.random(shape1)
        npa2 = np.random.random(shape2)
        if reshape_first:
            expected = np.dot(np.reshape(npa1, new_shape), npa2)
        else:
            expected = np.dot(npa1, np.reshape(npa2, new_shape))

        t1 = expr.from_numpy(npa1)
        t2 = expr.from_numpy(npa2)
        if reshape_first:
            actual = expr.dot(expr.reshape(t1, new_shape), t2)
        else:
            actual = expr.dot(t1, expr.reshape(t2, new_shape))

        Assert.all_eq(expected, actual.glom(), 10e-9)
def test_optimization_reduced(self):
    """Check an optimized chain ending in a full sum against NumPy."""
    def pipeline(lhs, rhs, dot, total):
        # Runs unchanged on expressions and on NumPy arrays; only ``dot``
        # and ``total`` are library specific.
        diff = lhs - rhs
        combined = lhs + diff
        f = diff[200:900, 200:900]
        g = combined[200:900, 200:900]
        h = f - g
        i = f + h
        j = h[100:500, 100:500]
        k = i[100:500, 100:500]
        product = dot(j, k)
        pair_sum = j + k
        n = k - product
        o = n - pair_sum
        q = n + o
        return total(q - pair_sum)

    na = np.random.rand(1000, 1000)
    nb = np.random.rand(1000, 1000)

    s = pipeline(expr.from_numpy(na), expr.from_numpy(nb),
                 expr.dot, expr.sum)
    ns = pipeline(na, nb, np.dot, np.sum)

    # Our sum seems to reduce precision
    Assert.all_eq(ns, s.optimized().glom(), tolerance=1e-6)
def test_optimization_reduced(self):
    """Compare an optimized expression chain reduced by sum with NumPy."""
    outer = (slice(200, 900), slice(200, 900))
    inner = (slice(100, 500), slice(100, 500))

    def reduce_chain(x, y, dot, reducer):
        # Tuple-of-slices indexing is equivalent to ``v[a:b, c:d]``.
        x_minus_y = x - y
        x_plus_diff = x + x_minus_y
        f = x_minus_y[outer]
        g = x_plus_diff[outer]
        h = f - g
        i = f + h
        j = h[inner]
        k = i[inner]
        l = dot(j, k)
        m = j + k
        n = k - l
        o = n - m
        r = (n + o) - m
        return reducer(r)

    na = np.random.rand(1000, 1000)
    nb = np.random.rand(1000, 1000)
    a = expr.from_numpy(na)
    b = expr.from_numpy(nb)

    distributed_sum = reduce_chain(a, b, expr.dot, expr.sum)
    local_sum = reduce_chain(na, nb, np.dot, np.sum)

    # Our sum seems to reduce precision
    Assert.all_eq(local_sum, distributed_sum.optimized().glom(),
                  tolerance=1e-6)
def benchmark_qr(ctx, timer):
    """Print how long ``qr`` takes on a random 1280x1280 matrix.

    ``ctx`` and ``timer`` are accepted but not used.
    """
    rows = cols = 1280
    # Generated with NumPy and wrapped into an expression.
    matrix = expr.from_numpy(np.random.randn(rows, cols))

    started = datetime.now()
    _q, _r = qr(matrix)
    finished = datetime.now()

    elapsed = millis(started, finished)
    print("total cost time:%s ms" % (elapsed))
def test_optimization_shape(self):
    """Check an optimized chain that mixes slicing, ravel/reshape and dot.

    The same operations are applied to distributed expressions and to NumPy
    arrays, and the final 100x100 windows are compared.
    """
    shape = (200, 800)
    # ``int`` is used instead of the ``np.int`` alias, which newer NumPy
    # releases no longer provide; both name the same builtin type.
    na = np.arange(np.prod(shape), dtype=int).reshape(shape)
    nb = np.random.randint(1, 1000, (1000, 1000))
    nc = np.random.randint(1, 1000, (1000, 1000))

    # Distributed side.
    a = expr.arange(shape, dtype=int)
    b = expr.from_numpy(nb)
    c = expr.from_numpy(nc)
    d = b + c
    e = b + d
    f = d[200:900, 200:900]
    g = e[200:900, 200:900]
    h = f + g
    i = f + h
    j = h[100:500, 100:500]
    k = i[100:300, 100:300]
    # j is 400x400 = 160000 elements, i.e. exactly 800x200.
    l = expr.reshape(expr.ravel(j), (800, 200))
    m = expr.dot(a, l)
    n = m + k
    o = n + m
    q = o[100:200, 100:200]

    # NumPy reference, step for step.
    nd = nb + nc
    ne = nb + nd
    nf = nd[200:900, 200:900]
    ng = ne[200:900, 200:900]
    nh = nf + ng
    ni = nf + nh
    nj = nh[100:500, 100:500]
    nk = ni[100:300, 100:300]
    nl = np.reshape(np.ravel(nj), (800, 200))
    nm = np.dot(na, nl)
    nn = nm + nk
    no = nn + nm
    nq = no[100:200, 100:200]

    Assert.all_eq(nq, q.optimized().glom(), tolerance=1e-10)
def test_ndimension(self):
    """sort/argsort along every axis of random 2..5-d arrays match NumPy."""
    function_pairs = ((expr.sort, np.sort), (expr.argsort, np.argsort))

    for case_number in range(1, 6):
        dim = np.random.randint(low=2, high=6)
        shape = np.random.randint(low=5, high=11, size=dim)
        util.log_info('Test Case #%s: DIM(%s) shape%s',
                      case_number, dim, shape)

        na = new_ndarray(shape)
        a = expr.from_numpy(na)

        for axis in range(dim):
            for distributed_fn, numpy_fn in function_pairs:
                Assert.all_eq(distributed_fn(a, axis).glom(),
                              numpy_fn(na, axis))
def test_combo(self):
    """Mixing new axes, slices and integer indices must match NumPy."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    span = slice(2, 7)
    AXIS = object()  # placeholder replaced by the library-specific newaxis

    def build(pattern, newaxis):
        # ``v[tuple]`` is the same as writing the items inline in ``v[...]``.
        return tuple(newaxis if part is AXIS else part for part in pattern)

    patterns = [
        (AXIS, span, 4),
        (span, AXIS, -1),
        (-1, AXIS, span),
        (AXIS, span, AXIS, AXIS, 4, AXIS, AXIS),
    ]
    for pattern in patterns:
        Assert.all_eq(na[build(pattern, np.newaxis)],
                      a[build(pattern, expr.newaxis)].glom())

    logged = (AXIS, span, AXIS, AXIS, -1, AXIS, AXIS)
    util.log_info('\na.shape: %s \nna.shape: %s',
                  a[build(logged, expr.newaxis)].shape,
                  na[build(logged, np.newaxis)].shape)
def test_combo(self):
    """Combined newaxis / slice / integer indexing agrees with NumPy."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    s = slice(2, 7)

    def indices(nx):
        # Same index tuples, parameterized by the newaxis object to use.
        return [
            (nx, s, 4),
            (s, nx, -1),
            (-1, nx, s),
            (nx, s, nx, nx, 4, nx, nx),
        ]

    for np_index, sp_index in zip(indices(np.newaxis), indices(expr.newaxis)):
        Assert.all_eq(na[np_index], a[sp_index].glom())

    sp_logged = a[expr.newaxis, s, expr.newaxis, expr.newaxis, -1,
                  expr.newaxis, expr.newaxis]
    np_logged = na[np.newaxis, s, np.newaxis, np.newaxis, -1,
                   np.newaxis, np.newaxis]
    util.log_info('\na.shape: %s \nna.shape: %s',
                  sp_logged.shape, np_logged.shape)
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
    '''
    Cluster data points with the k-means based spectral clustering method.

    Args:
      points(Expr or DistArray): the data points to be clustered.
      k(int): the number of clusters we need to generate.
      num_iter(int): the max number of iterations that kmeans clustering method runs.
      similarity_measurement(str): distance method used to measure similarity between two points.

    Returns:
      The labels produced by ``KMeans.fit`` on the leading k eigenvectors.
    '''
    n_points = points.shape[0]

    # Pairwise similarities form the adjacency matrix.
    adjacency = expr.shuffle(
        points,
        _row_similarity_mapper,
        kw={'similarity_measurement': similarity_measurement},
        shape_hint=(n_points, n_points))
    num_dims = adjacency.shape[1]

    # Row sums of the adjacency matrix (the diagonal of the degree matrix).
    degree = expr.sum(adjacency, axis=1, tile_hint=(adjacency.shape[0], ))

    # Normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
    laplacian = expr.shuffle(adjacency,
                             _laplacian_mapper,
                             kw={'D': degree},
                             shape_hint=adjacency.shape)

    # Eigen-decomposition with the Lanczos solver, asking for up to 2k
    # vectors and keeping the first k columns.
    overshoot = min(k * 2, num_dims)
    _eigenvalues, eigenvectors = lanczos.solve(laplacian, laplacian,
                                               overshoot, True)
    eigenvectors = eigenvectors[:, 0:k]

    # For every column, the row holding that column's maximum becomes an
    # initial center.
    init_clusters = eigenvectors[np.argmax(eigenvectors, axis=0)]

    # Run kmeans clustering with init_clusters
    kmeans = KMeans(k, num_iter)
    embedded = expr.from_numpy(eigenvectors)
    _centers, labels = kmeans.fit(embedded, init_clusters)
    return labels
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
    '''
    Cluster data points with the k-means based spectral clustering method.

    Args:
      points(Expr or DistArray): the data points to be clustered.
      k(int): the number of clusters we need to generate.
      num_iter(int): the max number of iterations that kmeans clustering method runs.
      similarity_measurement(str): distance method used to measure similarity between two points.

    Returns:
      The labels produced by ``KMeans.fit`` on the leading k eigenvectors.
    '''
    # Pairwise similarities form the adjacency matrix.
    similarity_kw = {'similarity_measurement': similarity_measurement}
    adjacency = expr.shuffle(points, _row_similarity_mapper, kw=similarity_kw)
    num_dims = adjacency.shape[1]

    # Row sums of the adjacency matrix (the diagonal of the degree matrix).
    degree = expr.sum(adjacency, axis=1, tile_hint=(adjacency.shape[0],))

    # Normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
    laplacian = expr.shuffle(adjacency, _laplacian_mapper, kw={'D': degree})

    # Eigen-decomposition with the Lanczos solver, asking for up to 2k
    # vectors and keeping the first k columns.
    overshoot = min(k * 2, num_dims)
    _eigenvalues, vectors = lanczos.solve(laplacian, laplacian,
                                          overshoot, True)
    vectors = vectors[:, 0:k]

    # For every column, the row holding that column's maximum becomes an
    # initial center.
    seed_rows = np.argmax(vectors, axis=0)
    init_clusters = vectors[seed_rows]

    # Run kmeans clustering with init_clusters
    kmeans = KMeans(k, num_iter)
    _centers, labels = kmeans.fit(expr.from_numpy(vectors), init_clusters)
    return labels
def test_assign_expr(self):
    """Assigning a distributed source into a region must match NumPy."""
    # (destination shape, source shape, destination region)
    cases = [
        # Small matrix
        ((20, 10), (10, ), np.s_[10, ]),
        # Larger matrix
        ((200, 100), (100, ), np.s_[50, ]),
        # Worst case region
        ((200, 100), (3, 50), np.s_[99:102, 25:75]),
    ]

    for dest_shape, src_shape, region_a in cases:
        a = np.random.randn(*dest_shape)
        b = np.random.randn(*src_shape)
        sp_a = assign(from_numpy(a), region_a, from_numpy(b)).glom()
        a[region_a] = b
        Assert.all_eq(sp_a, a)
def solve(A, AT, desired_rank, is_symmetric=False):
    '''
    A simple implementation of the Lanczos algorithm
    (http://en.wikipedia.org/wiki/Lanczos_algorithm) for eigenvalue computation.

    Like the Mahout implementation, only the matrix*vector step is parallelized.

    First the Lanczos method turns the matrix into tridiagonal form. Then
    numpy.linalg.eig extracts the eigenvalues and eigenvectors of that
    tridiagonal matrix, whose size is (desired_rank + 2) squared. Since the
    rank should be much smaller than the size of the matrix, this step runs
    locally.

    Returns the ``desired_rank`` eigenvalues of largest magnitude and the
    matching eigenvectors (as columns).
    '''
    # Calculate two more eigenvalues, but we only keep the largest desired_rank
    # ones. Doing this to keep the result consistent with scipy.sparse.linalg.svds.
    rank = desired_rank + 2

    n = A.shape[1]
    v_next = np.ones(n) / np.sqrt(n)
    v_prev = np.zeros(n)
    beta = np.zeros(rank + 1)  # beta[0] stays 0
    alpha = np.zeros(rank)

    # Since the rank << size of matrix, V is kept in local memory for
    # efficiency (it is updated on every iteration). If V ever does not fit
    # in local memory it could be turned into a distributed array.
    V = np.zeros((n, rank))

    for step in range(rank):
        util.log_info("Iter : %s", step)
        v_next_expr = expr.from_numpy(v_next.reshape(n, 1))

        if is_symmetric:
            w = expr.dot(A, v_next_expr).optimized().glom().reshape(n)
        else:
            w = expr.dot(A, v_next_expr)
            w = expr.dot(AT, w).optimized().glom().reshape(n)

        alpha[step] = np.dot(w, v_next)
        w = w - alpha[step] * v_next - beta[step] * v_prev

        # Orthogonalize against every basis vector stored so far.
        for col in range(step):
            projection = np.dot(w, V[:, col])
            if projection != 0.0:
                w -= projection * V[:, col]

        beta[step + 1] = np.linalg.norm(w, 2)
        # The vector used in this step becomes column ``step`` of the basis.
        V[:, step] = v_next
        v_prev, v_next = v_next, w / beta[step + 1]

    # Create tridiag matrix with size (rank X rank): alpha on the diagonal,
    # beta[1:rank] on both off-diagonals.
    tridiag = np.diag(alpha)
    below = np.arange(rank - 1)
    off_diagonal = beta[1:rank]
    tridiag[below, below + 1] = off_diagonal
    tridiag[below + 1, below] = off_diagonal

    # Get eigenvectors and eigenvalues of this tridiagonal matrix.
    # The eigenvalues of this tridiagonal matrix equal the eigenvalues
    # of matrix dot(A, A.T). We can get the eigenvectors of dot(A, A.T)
    # by multiplying V with eigenvectors of this tridiagonal matrix.
    eigenvalues, small_vectors = np.linalg.eig(tridiag)

    # Sort eigenvalues and their corresponding eigenvectors by decreasing
    # magnitude.
    order = np.argsort(np.absolute(eigenvalues))[::-1]
    eigenvalues = eigenvalues[order]
    small_vectors = small_vectors[:, order]

    # Get the eigenvectors of dot(A, A.T)
    eigenvectors = np.dot(V, small_vectors)
    return eigenvalues[0:rank - 2], eigenvectors[:, 0:rank - 2]
def fit(self, X):
    """Remember ``X`` as the reference data and return ``self``.

    A NumPy array is wrapped with ``expr.from_numpy``; any other object is
    stored unchanged.
    """
    self.X = expr.from_numpy(X) if isinstance(X, np.ndarray) else X
    return self
def kneighbors(self, X, n_neighbors=None):
    """Finds the K-neighbors of a point.

    Returns the distances to, and the indices of, the nearest fitted points
    for every row of ``X``.

    Parameters
    ----------
    X : array-like, last dimension same as that of fit data
        The new point.
    n_neighbors : int
        Number of neighbors to get (default is ``self.n_neighbors``).  When
        given, it also replaces ``self.n_neighbors``.

    Returns
    -------
    dist : array
        Array representing the lengths to point.
    ind : array
        Indices of the nearest points in the population matrix.
    """
    if n_neighbors is not None:
        self.n_neighbors = n_neighbors
    # Use the effective neighbor count everywhere.  Slicing with the raw
    # argument would turn ``n_neighbors=None`` into ``[:, :None]`` and
    # return all points rather than the k nearest.
    k = self.n_neighbors

    if isinstance(X, np.ndarray):
        X = expr.from_numpy(X)

    if self.algorithm in ('auto', 'brute'):
        # Broadcast (q, 1, d) against (1, n, d) to get all pairwise
        # squared distances.
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        fit_X_broadcast = expr.reshape(
            self.X, (1, self.X.shape[0], self.X.shape[1]))
        distances = expr.sum((X_broadcast - fit_X_broadcast)**2, axis=2)

        neigh_ind = expr.argsort(distances, axis=1)
        neigh_ind = neigh_ind[:, :k].optimized().glom()
        neigh_dist = expr.sort(distances, axis=1)
        neigh_dist = expr.sqrt(neigh_dist[:, :k]).optimized().glom()
        return neigh_dist, neigh_ind

    results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                  kw={
                                      'X': self.X,
                                      'Q': X,
                                      'n_neighbors': self.n_neighbors,
                                      'algorithm': self.algorithm
                                  })
    dist = None
    ind = None
    # Get the KNN candidates for each tile of X, then find out the real KNN
    for _, candidates in results.iteritems():
        tile_dist, tile_ind = candidates[0], candidates[1]
        if dist is None:
            dist = tile_dist
            ind = tile_ind
        else:
            dist = np.concatenate((dist, tile_dist), axis=1)
            ind = np.concatenate((ind, tile_ind), axis=1)

    # Keep, per query row, the k smallest distances among all candidates.
    mask = np.argsort(dist, axis=1)[:, :k]
    new_dist = np.array([row[cols] for row, cols in zip(dist, mask)])
    new_ind = np.array([row[cols] for row, cols in zip(ind, mask)])
    return new_dist, new_ind
def fit(self, X, centers=None, implementation='map2'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    implementation : str
        Which formulation of the per-iteration update to run: 'map2',
        'outer', 'broadcast' or 'shuffle'.

    Returns
    -------
    centers, labels
        The centers after ``self.n_iter`` iterations and the labels of the
        last iteration.  'map2' and 'shuffle' leave the updated centers as
        local NumPy arrays; 'outer' and 'broadcast' wrap them with
        ``expr.from_numpy`` after every iteration.

    Raises
    ------
    ValueError
        If ``implementation`` is not one of the supported names (previously
        the method fell through and returned None).
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]
    # ``int`` replaces the ``np.int`` alias, which newer NumPy releases no
    # longer provide; both name the same builtin type.
    labels = expr.zeros((num_points, 1), dtype=int)

    def _reseed_and_average(sums, counts):
        # Turn per-cluster sums into means, reseeding empty clusters first.
        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)
        if np.any(zcount_indices):
            # One or more centroids may not have any points assigned to them,
            # which results in their position being the zero-vector. We
            # reseed these centroids with new random values.
            n_points = np.count_nonzero(zcount_indices)
            # In order to get rid of dividing by zero.
            counts[zcount_indices] = 1
            sums[zcount_indices, :] = np.random.randn(n_points, num_dim)
        return sums / counts.reshape(sums.shape[0], 1)

    if implementation == 'map2':
        if centers is None:
            centers = np.random.rand(self.n_clusters, num_dim)

        for i in range(self.n_iter):
            labels = expr.map2(X, 0,
                               fn=kmeans_map2_dist_mapper,
                               fn_kw={"centers": centers},
                               shape=(X.shape[0], ))
            counts = expr.map2(labels, 0,
                               fn=kmeans_count_mapper,
                               fn_kw={'centers_count': self.n_clusters},
                               shape=(centers.shape[0], ))
            new_centers = expr.map2(
                (X, labels), (0, 0),
                fn=kmeans_center_mapper,
                fn_kw={'centers_count': self.n_clusters},
                shape=(centers.shape[0], centers.shape[1]))

            counts = counts.optimized().glom()
            centers = new_centers.optimized().glom()
            centers = _reseed_and_average(centers, counts)
        return centers, labels

    elif implementation == 'outer':
        if centers is None:
            centers = expr.rand(self.n_clusters, num_dim)

        for i in range(self.n_iter):
            labels = expr.outer((X, centers), (0, None),
                                fn=kmeans_outer_dist_mapper,
                                shape=(X.shape[0], ))
            counts = expr.map2(labels, 0,
                               fn=kmeans_count_mapper,
                               fn_kw={'centers_count': self.n_clusters},
                               shape=(centers.shape[0], ))
            new_centers = expr.map2(
                (X, labels), (0, 0),
                fn=kmeans_center_mapper,
                fn_kw={'centers_count': self.n_clusters},
                shape=(centers.shape[0], centers.shape[1]))

            counts = counts.optimized().glom()
            centers = new_centers.optimized().glom()
            centers = _reseed_and_average(centers, counts)
            # Wrap again so the next iteration receives an expression.
            centers = expr.from_numpy(centers)
        return centers, labels

    elif implementation == 'broadcast':
        if centers is None:
            centers = expr.rand(self.n_clusters, num_dim)

        for i in range(self.n_iter):
            util.log_warn("k_means_ %d %d", i, time.time())
            # Reshape to (n, 1, d) and (1, k, d) so the subtraction
            # broadcasts to one difference vector per (point, center) pair.
            X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
            centers_broadcast = expr.reshape(
                centers, (1, centers.shape[0], centers.shape[1]))
            distances = expr.sum(expr.square(X_broadcast - centers_broadcast),
                                 axis=2)
            labels = expr.argmin(distances, axis=1)

            # One-hot membership: matches[p, c] is 1 when point p has label c.
            center_idx = expr.arange((1, centers.shape[0]))
            matches = expr.reshape(labels,
                                   (labels.shape[0], 1)) == center_idx
            matches = matches.astype(np.int64)
            counts = expr.sum(matches, axis=0)
            centers = expr.sum(
                X_broadcast * expr.reshape(
                    matches, (matches.shape[0], matches.shape[1], 1)),
                axis=0)

            counts = counts.optimized().glom()
            centers = centers.optimized().glom()
            centers = _reseed_and_average(centers, counts)
            # Wrap again so the next iteration can reshape an expression.
            centers = expr.from_numpy(centers)
        return centers, labels

    elif implementation == 'shuffle':
        if centers is None:
            centers = np.random.rand(self.n_clusters, num_dim)

        for i in range(self.n_iter):
            # Reset them to zero.
            new_centers = expr.ndarray((self.n_clusters, num_dim),
                                       reduce_fn=lambda a, b: a + b)
            new_counts = expr.ndarray((self.n_clusters, 1),
                                      dtype=int,
                                      reduce_fn=lambda a, b: a + b)

            shuffled = expr.shuffle(X,
                                    _find_cluster_mapper,
                                    kw={
                                        'd_pts': X,
                                        'old_centers': centers,
                                        'new_centers': new_centers,
                                        'new_counts': new_counts,
                                        'labels': labels
                                    },
                                    shape_hint=(1, ),
                                    cost_hint={
                                        hash(labels): {
                                            '00': 0,
                                            '01': np.prod(labels.shape)
                                        }
                                    })
            # The shuffle is forced for its effect on the accumulators
            # passed in ``kw``; its own result is not used.
            shuffled.force()

            new_counts = new_counts.glom()
            new_centers = new_centers.glom()
            # new_counts already has shape (n_clusters, 1), so the helper's
            # reshape leaves it unchanged.
            centers = _reseed_and_average(new_centers, new_counts)
        return centers, labels

    else:
        raise ValueError("unknown implementation: %r" % (implementation, ))
def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Runs ``self.n_iter`` rounds of: assign every point to its nearest
    center, sum the points of each cluster, then divide by the cluster
    sizes to obtain the new centers.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features).
        It should be tiled by rows.
    centers : numpy.ndarray, optional
        The initial centers. If None, they are randomly generated
        (with ``np.random.rand`` for 'map2'/'shuffle' and ``expr.rand``
        for 'outer'/'broadcast').
    implementation : str
        Which distributed strategy to use: 'map2', 'outer', 'broadcast'
        or 'shuffle'.

    Returns
    -------
    (centers, labels)
        The centers after the last iteration and the label expression.
        For 'outer' and 'broadcast' the centers are wrapped with
        ``expr.from_numpy``; for 'map2' and 'shuffle' they are the result
        of dividing the glommed sums by the glommed counts.
        For 'shuffle', ``labels`` is the zero-initialised array handed to
        ``_find_cluster_mapper`` (presumably filled in by that mapper --
        verify against its definition).

    Raises
    ------
    ValueError
        If ``implementation`` is not one of the supported names.
    """
    known = ('map2', 'outer', 'broadcast', 'shuffle')
    # Validate up front: previously an unknown name fell through every
    # branch and the method silently returned None.
    if implementation not in known:
        raise ValueError("unknown implementation %r; expected one of %s"
                         % (implementation, ', '.join(known)))

    num_dim = X.shape[1]
    num_points = X.shape[0]

    # `np.int` was only an alias of the builtin and no longer exists in
    # current NumPy releases; the builtin `int` is the same dtype.
    labels = expr.zeros((num_points, 1), dtype=int)

    def _mean_centers(sums, counts):
        """Turn per-cluster coordinate sums into means, reseeding empty clusters."""
        # Clusters that did not get any points assigned to them.
        empty = (counts == 0).reshape(self.n_clusters)
        if np.any(empty):
            # Such clusters have the zero-vector as their sum. Reseed them
            # with new random values and set their count to 1 so that the
            # division below does not divide by zero.
            n_empty = np.count_nonzero(empty)
            counts[empty] = 1
            sums[empty, :] = np.random.randn(n_empty, num_dim)
        return sums / counts.reshape(sums.shape[0], 1)

    def _map2_sums_and_counts(point_labels, centers_shape):
        """Accumulate per-cluster sums and sizes with map2; return them glommed."""
        counts = expr.map2(point_labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers_shape[0],))
        sums = expr.map2((X, point_labels), (0, 0), fn=kmeans_center_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers_shape[0], centers_shape[1]))
        counts = counts.optimized().glom()
        sums = sums.optimized().glom()
        return sums, counts

    if implementation == 'map2':
        if centers is None:
            centers = np.random.rand(self.n_clusters, num_dim)

        for _ in range(self.n_iter):
            labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                               fn_kw={"centers": centers},
                               shape=(X.shape[0],))
            sums, counts = _map2_sums_and_counts(labels, centers.shape)
            centers = _mean_centers(sums, counts)
        return centers, labels

    if implementation == 'outer':
        if centers is None:
            centers = expr.rand(self.n_clusters, num_dim)

        for _ in range(self.n_iter):
            labels = expr.outer((X, centers), (0, None),
                                fn=kmeans_outer_dist_mapper,
                                shape=(X.shape[0],))
            sums, counts = _map2_sums_and_counts(labels, centers.shape)
            # Wrap again so the next iteration receives an expr.
            centers = expr.from_numpy(_mean_centers(sums, counts))
        return centers, labels

    if implementation == 'broadcast':
        if centers is None:
            centers = expr.rand(self.n_clusters, num_dim)

        for i in range(self.n_iter):
            util.log_warn("k_means_ %d %d", i, time.time())
            # Reshape to (n, 1, d) and (1, k, d) so the subtraction
            # broadcasts to every (point, center) pair.
            X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
            centers_broadcast = expr.reshape(
                centers, (1, centers.shape[0], centers.shape[1]))
            distances = expr.sum(
                expr.square(X_broadcast - centers_broadcast), axis=2)
            labels = expr.argmin(distances, axis=1)

            # matches[p, c] is 1 when point p is labelled with center c.
            center_idx = expr.arange((1, centers.shape[0]))
            matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
            matches = matches.astype(np.int64)

            counts = expr.sum(matches, axis=0)
            sums = expr.sum(
                X_broadcast * expr.reshape(
                    matches, (matches.shape[0], matches.shape[1], 1)),
                axis=0)

            counts = counts.optimized().glom()
            sums = sums.optimized().glom()
            # Wrap again so the next iteration receives an expr.
            centers = expr.from_numpy(_mean_centers(sums, counts))
        return centers, labels

    # implementation == 'shuffle'
    if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

    for _ in range(self.n_iter):
        # Fresh accumulators for every iteration; updates to them are
        # combined by addition.
        new_centers = expr.ndarray((self.n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((self.n_clusters, 1), dtype=int,
                                  reduce_fn=lambda a, b: a + b)

        shuffled = expr.shuffle(
            X,
            _find_cluster_mapper,
            kw={'d_pts': X,
                'old_centers': centers,
                'new_centers': new_centers,
                'new_counts': new_counts,
                'labels': labels},
            shape_hint=(1,),
            cost_hint={hash(labels): {'00': 0,
                                      '01': np.prod(labels.shape)}})
        # Only the side effects on the accumulators are needed.
        shuffled.force()

        centers = _mean_centers(new_centers.glom(), new_counts.glom())

    return centers, labels