def benchmark_linear_regression(ctx, timer):
  N_EXAMPLES = 10 * 1000 * 1000 * ctx.num_workers
  N_DIM = 10
  x = expr.rand(N_EXAMPLES, N_DIM,
                tile_hint=(N_EXAMPLES / ctx.num_workers, N_DIM)).astype(np.float32)
  y = expr.rand(N_EXAMPLES, 1,
                tile_hint=(N_EXAMPLES / ctx.num_workers, 1)).astype(np.float32)
  w = np.random.rand(N_DIM, 1).astype(np.float32)

  x = expr.eager(x)
  y = expr.eager(y)

  def _step():
    yp = expr.dot(x, w)
    Assert.all_eq(yp.shape, y.shape)
    diff = x * (yp - y)
    grad = expr.sum(diff, axis=0).glom().reshape((N_DIM, 1))
    wprime = w - grad * 1e-6
    wprime.evaluate()

  for i in range(25):
    timer.time_op('linear-regression', _step)

def benchmark_linear_regression(ctx, timer):
  N_EXAMPLES = 10 * 1000 * 1000 * ctx.num_workers
  N_DIM = 10
  x = expr.rand(N_EXAMPLES, N_DIM,
                tile_hint=(N_EXAMPLES / ctx.num_workers, N_DIM)).astype(np.float32)
  y = expr.rand(N_EXAMPLES, 1,
                tile_hint=(N_EXAMPLES / ctx.num_workers, 1)).astype(np.float32)
  w = np.random.rand(N_DIM, 1).astype(np.float32)

  x = expr.eager(x)
  y = expr.eager(y)

  def _step():
    yp = expr.dot(x, w)
    Assert.all_eq(yp.shape, y.shape)
    diff = x * (yp - y)
    grad = expr.sum(diff, axis=0).glom().reshape((N_DIM, 1))
    wprime = w - grad * 1e-6
    expr.force(wprime)

  for i in range(25):
    timer.time_op('linear-regression', _step)

def test_knn(self):
  ctx = spartan.blob_ctx.get()
  N_QUERY = ctx.num_workers * 2
  N_DIM = ctx.num_workers * 2
  X = expr.rand(N_SAMPLES, N_DIM)
  Y = expr.rand(N_QUERY, N_DIM)
  #dist, ind = SKNN().fit(X).kneighbors(Y)
  dist2, ind2 = NearestNeighbors().fit(X).kneighbors(Y)

def benchmark_lreg(ctx, timer):
  print "#worker:", ctx.num_workers
  FLAGS.opt_parakeet_gen = 0
  N_EXAMPLES = 4000000 * ctx.num_workers
  #N_EXAMPLES = 5000000 * 64
  x = expr.rand(N_EXAMPLES, N_DIM)
  y = expr.rand(N_EXAMPLES, 1)

  start = time.time()
  linear_regression.linear_regression(x, y, ITERATION)
  total = time.time() - start
  util.log_warn("time cost : %s s" % (total * 1.0 / ITERATION,))

def benchmark_ridgereg(ctx, timer):
  print "#worker:", ctx.num_workers
  #N_EXAMPLES = 100000000 * ctx.num_workers
  N_EXAMPLES = 90000000 * ctx.num_workers
  x = expr.rand(N_EXAMPLES, N_DIM)
  y = expr.rand(N_EXAMPLES, 1)

  start = time.time()
  ridge_regression.ridge_regression(x, y, 1, ITERATION)
  total = time.time() - start
  util.log_warn("time cost : %s s" % (total * 1.0 / ITERATION,))

def benchmark_logreg(ctx, timer):
  print "#worker:", ctx.num_workers
  #N_EXAMPLES = 40000000 * ctx.num_workers
  N_EXAMPLES = 5000000 * 64
  x = expr.eager(expr.rand(N_EXAMPLES, N_DIM,
                           tile_hint=(N_EXAMPLES / ctx.num_workers, N_DIM)))
  y = expr.eager(expr.rand(N_EXAMPLES, 1,
                           tile_hint=(N_EXAMPLES / ctx.num_workers, 1)))

  start = time.time()
  logistic_regression.logistic_regression(x, y, ITERATION)
  total = time.time() - start
  util.log_warn("time cost : %s s" % (total * 1.0 / ITERATION,))

def benchmark_knn(ctx, timer):
  print "#worker:", ctx.num_workers
  N_SAMPLES = ctx.num_workers * 300
  N_QUERY = ctx.num_workers * 2
  N_DIM = ctx.num_workers * 2
  X = expr.rand(N_SAMPLES, N_DIM)
  Y = expr.rand(N_QUERY, N_DIM)

  t1 = datetime.now()
  dist2, ind2 = NearestNeighbors().fit(X).kneighbors(Y)
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print "total cost time:%s ms" % (cost_time)

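# Several of these benchmarks time work with a millis(t1, t2) helper that is
# not part of the excerpts shown here.  A minimal sketch, assuming it simply
# converts the difference between two datetime objects to milliseconds (the
# real helper may live in a shared util module):
def millis(t1, t2):
  dt = t2 - t1
  return (dt.days * 24 * 3600 + dt.seconds) * 1000.0 + dt.microseconds / 1000.0
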
def random_galaxy(n):
  '''Generate a galaxy of random bodies.'''
  dtype = np.float  # consistent with sp.rand, same as np.float64

  galaxy = {  # All bodies stand still initially.
      'm': (rand(n) + dtype(10)) * dtype(m_sol / 10),
      'x': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
      'y': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
      'z': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
      'vx': zeros((n, )),
      'vy': zeros((n, )),
      'vz': zeros((n, ))
  }
  return galaxy

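# random_galaxy() above relies on module-level names that are not shown in the
# excerpt: rand / zeros (NumPy or spartan equivalents) and the constants m_sol
# and r_ly.  A plausible sketch, assuming SI units; the real module may define
# or scale these differently:
import numpy as np
from numpy import zeros
from numpy.random import rand

m_sol = 1.98892e30   # mass of the sun, in kg (assumed value)
r_ly = 9.4607e15     # one light-year, in meters (assumed value)
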
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = points.evaluate()
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  #labels = expr.zeros((points.shape[0],), dtype=np.int)
  for iter in range(num_iter):
    centers = centers.glom()
    fuzzy = expr.map2(points, 0, fn=kmeans_map2_dist_mapper,
                      fn_kw={"centers": centers, "m": m},
                      shape=(points.shape[0], centers.shape[0]))
    labels = expr.argmax(fuzzy, axis=1)
    new_centers = expr.map2((points, fuzzy), (0, 0),
                            fn=kmeans_map2_center_mapper,
                            fn_kw={"centers": centers, "m": m},
                            shape=(centers.shape[0], centers.shape[1]),
                            reducer=np.add)
    new_centers /= expr.sum(fuzzy ** m, axis=0)[:, expr.newaxis]
    centers = new_centers
  return labels

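# A minimal usage sketch for the fuzzy_kmeans() above.  It assumes a Spartan
# worker context is already running (as in the surrounding benchmark
# functions); the sizes and parameters below are arbitrary examples.
pts = expr.rand(10000, 2)                              # 10,000 random 2-D points
labels = fuzzy_kmeans(pts, k=5, num_iter=10, m=2.0)    # returns an Expr of hard labels
print labels.glom()[:10]                               # materialize a few assignments
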
def benchmark_cg(ctx, timer):
  print "#worker:", ctx.num_workers
  l = int(math.sqrt(ctx.num_workers))
  n = 2000 * 16
  #n = 4000 * l
  la = 20
  niter = 5
  tile_hint = (n, n / ctx.num_workers)
  #nonzer = 7
  #nz = n * (nonzer + 1) * (nonzer + 1) + n * (nonzer + 2)
  #density = 0.5 * nz / (n * n)

  A = expr.rand(n, n, tile_hint=tile_hint)
  A = (A + expr.transpose(A)) * 0.5
  I = expr.sparse_diagonal((n, n), tile_hint=tile_hint) * la
  I.force()
  A = expr.eager(A - I)

  #x1 = numpy_cg(A.glom(), niter)
  util.log_warn('begin cg!')
  t1 = datetime.now()
  x2 = conj_gradient(A, niter).force()
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time / niter)

def test_kmeans_expr(self):
  ctx = spartan.blob_ctx.get()
  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(divup(N_PTS, ctx.num_workers), N_DIM)).force()
  k = KMeans(N_CENTERS, ITER)
  k.fit(pts)

def benchmark_cg(ctx, timer):
  print "#worker:", ctx.num_workers
  l = int(math.sqrt(ctx.num_workers))
  #n = 2000 * 16
  n = 500 * ctx.num_workers
  la = 20
  niter = 5
  #nonzer = 7
  #nz = n * (nonzer + 1) * (nonzer + 1) + n * (nonzer + 2)
  #density = 0.5 * nz / (n * n)

  A = expr.rand(n, n)
  A = (A + expr.transpose(A)) * 0.5
  I = expr.sparse_diagonal((n, n)) * la
  A = A - I

  #x1 = numpy_cg(A.glom(), niter)
  util.log_warn('begin cg!')
  t1 = datetime.now()
  x2 = conj_gradient(A, niter).force()
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (
      cost_time, cost_time / niter)

def test_matrix_mult(self):
  _skip_if_travis()
  N_POINTS = 2000
  x = expr.rand(N_POINTS, N_POINTS,
                tile_hint=(N_POINTS, N_POINTS / self.ctx.num_workers)).astype(np.float32)
  y = expr.rand(N_POINTS, N_POINTS,
                tile_hint=(N_POINTS / self.ctx.num_workers, N_POINTS)).astype(np.float32)
  x = expr.eager(x)
  y = expr.eager(y)

  start = time.time()
  for i in range(5):
    res = expr.dot(x, y, tile_hint=(N_POINTS, N_POINTS / self.ctx.num_workers))
    res.force()
  cost = time.time() - start
  self._verify_cost("matrix_mult", cost)

def test_matrix_mult(self):
  _skip_if_travis()
  N_POINTS = 2000
  x = expr.rand(N_POINTS, N_POINTS,
                tile_hint=(N_POINTS, N_POINTS / self.ctx.num_workers)).astype(np.float32)
  y = expr.rand(N_POINTS, N_POINTS,
                tile_hint=(N_POINTS / self.ctx.num_workers, N_POINTS)).astype(np.float32)
  x = expr.eager(x)
  y = expr.eager(y)

  start = time.time()
  for i in range(5):
    res = expr.dot(x, y, tile_hint=(N_POINTS, N_POINTS / self.ctx.num_workers))
    res.evaluate()
  cost = time.time() - start
  self._verify_cost("matrix_mult", cost)

def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1, max_iter=10,
                 max_iter_per_doc=1):
  """
  Use the Collapsed Variational Bayes method (Mahout implementation) to train
  an LDA topic model.

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics: the number of topics we need to find.
    alpha(float): parameter of the LDA model.
    eta(float): parameter of the LDA model.
    max_iter(int): the max iterations to train the LDA topic model.
    max_iter_per_doc: the max iterations to train each document.
  """
  num_terms = terms_docs_matrix.shape[0]
  num_docs = terms_docs_matrix.shape[1]

  topic_term_counts = expr.rand(k_topics, num_terms)
  for i in range(max_iter):
    #topic_term_counts = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
    #                                 _lda_mapper,
    #                                 target=expr.ndarray((k_topics, num_terms), dtype=np.float64, reduce_fn=np.add),
    #                                 kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
    #                                     'max_iter_per_doc': max_iter_per_doc,
    #                                     'topic_term_counts': topic_term_counts}).optimized()
    topic_term_counts = expr.outer(
        (terms_docs_matrix, topic_term_counts), (1, None),
        fn=_lda_mapper,
        fn_kw={"k_topics": k_topics, "alpha": alpha, "eta": eta,
               "max_iter_per_doc": max_iter_per_doc},
        shape=(k_topics, num_terms),
        dtype=np.float64,
        reducer=np.add)

    # calculate the doc-topic inference
    #doc_topics = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
    #                          _lda_doc_topic_mapper,
    #                          kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
    #                              'max_iter_per_doc': max_iter_per_doc,
    #                              'topic_term_counts': topic_term_counts},
    #                          shape_hint=(num_docs, k_topics)).optimized()
    doc_topics = expr.outer(
        (terms_docs_matrix, topic_term_counts), (1, None),
        fn=_lda_doc_topic_mapper,
        fn_kw={"k_topics": k_topics, "alpha": alpha, "eta": eta,
               "max_iter_per_doc": max_iter_per_doc},
        shape=(num_docs, k_topics),
        dtype=np.float64)

    # normalize the topic-term distribution
    norm_val = expr.reduce(
        topic_term_counts,
        axis=1,
        dtype_fn=lambda input: input.dtype,
        local_reduce_fn=lambda ex, data, axis: np.abs(data).sum(axis),
        accumulate_fn=np.add)
    topic_term_counts = topic_term_counts / norm_val.reshape((k_topics, 1))
    topic_term_counts = topic_term_counts.optimized()

  return doc_topics, topic_term_counts

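# A minimal usage sketch for learn_topics() above, with a random matrix
# standing in for real term counts.  Assumes a running Spartan context; the
# sizes below (1000 terms, 200 documents, 10 topics) are arbitrary examples.
terms_docs = expr.rand(1000, 200)          # terms x documents
doc_topics, topic_term_counts = learn_topics(terms_docs, k_topics=10, max_iter=5)
print doc_topics.glom().shape              # (200, 10) per-document topic weights
print topic_term_counts.glom().shape       # (10, 1000) normalized topic-term counts
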
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = expr.as_array(centers)
    points_broadcast = expr.reshape(points, (points.shape[0], 1, points.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(points_broadcast - centers_broadcast), axis=2)
    # A small constant is added to avoid dividing by zero.
    distances = distances + 0.00000000001
    util.log_info('distances shape %s' % str(distances.shape))
    distances_broadcast = expr.reshape(distances, (distances.shape[0], 1,
                                                   distances.shape[1]))
    distances_broadcast2 = expr.reshape(distances, (distances.shape[0],
                                                    distances.shape[1], 1))
    prob = 1.0 / expr.sum(expr.power(distances_broadcast / distances_broadcast2,
                                     2.0 / (m - 1)), axis=2)
    prob.force()
    counts = expr.sum(prob, axis=0)
    counts = expr.reshape(counts, (counts.shape[0], 1))
    labels = expr.argmax(prob, axis=1)
    centers = expr.sum(expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
                       expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
                       axis=0)

    # We assume the centers matrix is relatively small, so it can be handled
    # on the master.
    counts = counts.glom()
    centers = centers.glom()

    # Check whether any centroids don't have any points assigned to them.
    zcount_indices = (counts == 0).reshape(k)
    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector.  We reseed these
      # centroids with new random values and set their counts to 1 to avoid
      # dividing by zero.
      counts[zcount_indices, :] = 1
      centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                  num_dim)

    centers = centers / counts
  return labels

def benchmark_canopy_clustering(ctx, timer):
  #N_PTS = 60000 * ctx.num_workers
  N_PTS = 30000 * 64
  N_DIM = 2
  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).evaluate()

  t1 = datetime.now()
  cluster_result = canopy_cluster(pts).evaluate()
  t2 = datetime.now()
  print "canopy_cluster time:%s ms" % millis(t1, t2)

def benchmark_linear_regression(ctx, timer):
  N_EXAMPLES = 65536
  N_DIM = 1
  x = expr.rand(N_EXAMPLES, N_DIM,
                tile_hint=(N_EXAMPLES / N_TILES, N_DIM)).astype(np.float32)
  x = expr.eager(x)

  def _step():
    y = expr.force(x * x)

  for i in range(25):
    _step()

def benchmark_canopy_clustering(ctx, timer):
  #N_PTS = 60000 * ctx.num_workers
  N_PTS = 30000 * 64
  N_DIM = 2
  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).force()

  t1 = datetime.now()
  cluster_result = canopy_cluster(pts).force()
  t2 = datetime.now()
  print 'canopy_cluster time:%s ms' % millis(t1, t2)

def benchmark_kmeans(ctx, timer):
  print "#worker:", ctx.num_workers
  N_PTS = 1000 * 256
  N_CENTERS = 10
  N_DIM = 512
  ITER = 1
  pts = expr.rand(N_PTS, N_DIM)
  k = KMeans(N_CENTERS, ITER)

  t1 = datetime.now()
  k.fit(pts)
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time / ITER)

def benchmark_fuzzy_kmeans(ctx, timer):
  #N_PTS = 40000 * ctx.num_workers
  N_PTS = 1000 * 256
  N_DIM = 512
  ITER = 5
  N_CENTERS = 10
  pts = expr.rand(N_PTS, N_DIM)

  t1 = datetime.now()
  cluster_result = fuzzy_kmeans(pts, k=N_CENTERS, num_iter=ITER).evaluate()
  t2 = datetime.now()
  time_cost = millis(t1, t2)
  print 'fuzzy_cluster time:%s ms, per_iter:%s ms' % (time_cost, time_cost / ITER)

def benchmark_spectral_clustering(ctx, timer):
  #N_PTS = 500 * ctx.num_workers
  N_PTS = 50 * 64
  N_DIM = 2
  ITER = 5
  N_CENTERS = 5
  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).force()

  t1 = datetime.now()
  cluster_result = spectral_cluster(pts, N_CENTERS, ITER).glom()
  t2 = datetime.now()
  print 'spectral_cluster time:%s ms' % millis(t1, t2)

def benchmark_spectral_clustering(ctx, timer):
  #N_PTS = 500 * ctx.num_workers
  N_PTS = 50 * 64
  N_DIM = 2
  ITER = 5
  N_CENTERS = 5
  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).evaluate()

  t1 = datetime.now()
  cluster_result = spectral_cluster(pts, N_CENTERS, ITER).glom()
  t2 = datetime.now()
  print 'spectral_cluster time:%s ms' % millis(t1, t2)

def test_linear_reg(self):
  _skip_if_travis()
  N_EXAMPLES = 10 * 1000 * 1000 * self.ctx.num_workers
  N_DIM = 10
  x = expr.rand(N_EXAMPLES, N_DIM,
                tile_hint=(N_EXAMPLES / self.ctx.num_workers, N_DIM)).astype(np.float32)
  y = expr.rand(N_EXAMPLES, 1,
                tile_hint=(N_EXAMPLES / self.ctx.num_workers, 1)).astype(np.float32)
  w = np.random.rand(N_DIM, 1).astype(np.float32)
  x = expr.eager(x)
  y = expr.eager(y)

  start = time.time()
  for i in range(5):
    yp = expr.dot(x, w)
    diff = x * (yp - y)
    grad = expr.sum(diff, axis=0, tile_hint=[N_DIM]).glom().reshape((N_DIM, 1))
    w = w - grad * 1e-6
  cost = time.time() - start
  self._verify_cost("linear_reg", cost)

def test_kmeans(self):
  _skip_if_travis()
  N_PTS = 1000 * 1000 * self.ctx.num_workers
  ITER = 5
  N_DIM = 10
  N_CENTERS = 10

  start = time.time()
  pts = expr.rand(N_PTS, N_DIM).evaluate()
  k = KMeans(N_CENTERS, ITER)
  k.fit(pts)
  cost = time.time() - start
  self._verify_cost("kmeans", cost)

def benchmark_fuzzy_kmeans(ctx, timer):
  #N_PTS = 40000 * ctx.num_workers
  N_PTS = 20000 * 64
  N_DIM = 2
  ITER = 5
  N_CENTERS = 10
  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).force()

  t1 = datetime.now()
  cluster_result = fuzzy_kmeans(pts, k=N_CENTERS, num_iter=ITER).force()
  t2 = datetime.now()
  time_cost = millis(t1, t2)
  print 'fuzzy_cluster time:%s ms, per_iter:%s ms' % (time_cost, time_cost / ITER)

def test_kmeans(self):
  _skip_if_travis()
  N_PTS = 1000 * 1000 * self.ctx.num_workers
  ITER = 5
  N_DIM = 10
  N_CENTERS = 10

  start = time.time()
  pts = expr.rand(N_PTS, N_DIM).force()
  k = KMeans(N_CENTERS, ITER)
  k.fit(pts)
  cost = time.time() - start
  self._verify_cost("kmeans", cost)

def benchmark_streaming_kmeans(ctx, timer):
  #N_PTS = 100 * ctx.num_workers
  N_PTS = 100 * 64
  N_DIM = 2
  N_CENTERS = 5
  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).force()
  print pts.glom()

  t1 = datetime.now()
  cluster_result = streaming_kmeans(pts, k=N_CENTERS).glom()
  t2 = datetime.now()
  #print cluster_result.glom()
  time_cost = millis(t1, t2)
  print 'streaming_kmeans_cluster time:%s ms' % time_cost

def benchmark_streaming_kmeans(ctx, timer):
  #N_PTS = 100 * ctx.num_workers
  N_PTS = 100 * 64
  N_DIM = 2
  N_CENTERS = 5
  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).evaluate()
  print pts.glom()

  t1 = datetime.now()
  cluster_result = streaming_kmeans(pts, k=N_CENTERS).glom()
  t2 = datetime.now()
  #print cluster_result.glom()
  time_cost = millis(t1, t2)
  print 'streaming_kmeans_cluster time:%s ms' % time_cost

def benchmark_pr(ctx, timer):
  num_pages = 300 * 1000 * 3 * ctx.num_workers
  num_outlinks = 10
  density = num_outlinks * 1.0 / num_pages
  same_site_prob = 0.9
  print "#worker:", ctx.num_workers
  col_step = util.divup(num_pages, ctx.num_workers)
  wts_tile_hint = [num_pages, col_step]
  p_tile_hint = [col_step, 1]

  #wts = expr.sparse_diagonal((num_pages, num_pages), dtype=np.float32, tile_hint=wts_tile_hint)
  #wts = expr.eager(
  #    expr.sparse_rand((num_pages, num_pages),
  #                     density=density,
  #                     format='csr',
  #                     dtype=np.float32,
  #                     tile_hint=wts_tile_hint))
  wts = pagerank_sparse(num_pages, num_outlinks, same_site_prob)

  #res = wts.glom().todense()
  #for i in range(res.shape[0]):
  #  l = []
  #  for j in range(res.shape[1]):
  #    l.append(round(res[i, j], 1))
  #  print l

  #p = expr.sparse_empty((num_pages, 1), dtype=np.float32, tile_hint=p_tile_hint).evaluate()
  #for i in range(num_pages):
  #  p[i, 0] = 1
  #p = expr.sparse_rand((num_pages, 1), density=1.0, format='csc', dtype=np.float32, tile_hint=p_tile_hint)
  p = expr.rand(num_pages, 1).astype(np.float32)

  #q = expr.zeros((num_pages, 1), dtype=np.float32, tile_hint=p_tile_hint).evaluate()
  #q[:] = p.glom().todense()
  #q = expr.lazify(q)

  #r = expr.dot(wts, p)
  #print r.glom()

  t1 = datetime.now()
  sparse_multiply(wts, p, p_tile_hint)
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print 'current benchmark:', cost_time / num_iter / 1000

def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20,
        num_iter=10, M=None):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS)
  method, where `A` is the "ratings" matrix which maps from a user and item to
  a rating score, and `U` and `M` are the factor matrices representing user and
  item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the regularization parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit feedback method for ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): max iterations to run.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]

  AT = expr.transpose(A)

  avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

  M = expr.rand(num_items, num_features)
  M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))

  #A = expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0))
  #AT = expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0))
  for i in range(num_iter):
    # Recompute U.
    shape = (num_users, num_features)
    U = expr.outer((A, M), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float)

    # Recompute M.
    shape = (num_items, num_features)
    M = expr.outer((AT, U), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float)
  return U, M

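# A minimal usage sketch for als() above, factoring a random "ratings" matrix.
# Assumes a running Spartan context; 500 users x 300 items and 5 iterations
# are arbitrary example sizes.
ratings = expr.rand(500, 300)
U, M = als(ratings, la=0.065, num_features=20, num_iter=5)
print U.glom().shape   # (500, 20) user factors
print M.glom().shape   # (300, 20) item factors
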
def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1, max_iter=10,
                 max_iter_per_doc=1):
  '''
  Use the Collapsed Variational Bayes method (Mahout implementation) to train
  an LDA topic model.

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics: the number of topics we need to find.
    alpha(float): parameter of the LDA model.
    eta(float): parameter of the LDA model.
    max_iter(int): the max iterations to train the LDA topic model.
    max_iter_per_doc: the max iterations to train each document.
  '''
  topic_term_counts = expr.rand(k_topics, terms_docs_matrix.shape[0],
                                tile_hint=(k_topics, terms_docs_matrix.shape[0]))
  for i in range(max_iter):
    new_topic_term_counts = expr.ndarray((k_topics, terms_docs_matrix.shape[0]),
                                         dtype=np.float64,
                                         reduce_fn=np.add,
                                         tile_hint=(k_topics, terms_docs_matrix.shape[0]))
    topic_term_counts = expr.shuffle(terms_docs_matrix, _lda_mapper,
                                     target=new_topic_term_counts,
                                     kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
                                         'max_iter_per_doc': max_iter_per_doc,
                                         'topic_term_counts': topic_term_counts})

    # calculate the doc-topic inference
    doc_topics = expr.shuffle(terms_docs_matrix, _lda_doc_topic_mapper,
                              kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
                                  'max_iter_per_doc': max_iter_per_doc,
                                  'topic_term_counts': topic_term_counts})

    # normalize the topic-term distribution
    norm_val = expr.reduce(topic_term_counts, axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.abs(data).sum(axis),
                           accumulate_fn=np.add)
    topic_term_counts = topic_term_counts / norm_val.reshape((topic_term_counts.shape[0], 1))

  return doc_topics, topic_term_counts

def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim, tile_hint=(k, num_dim))

  labels = expr.zeros((points.shape[0],), dtype=np.int,
                      tile_hint=(points.shape[0] / len(points.tiles),))

  for iter in range(num_iter):
    new_centers = expr.ndarray((k, num_dim), reduce_fn=lambda a, b: a + b,
                               tile_hint=(k, num_dim))
    new_counts = expr.ndarray((k, 1), dtype=np.float, reduce_fn=lambda a, b: a + b,
                              tile_hint=(k, 1))
    expr.shuffle(points, _fuzzy_kmeans_mapper,
                 kw={'old_centers': centers,
                     'centers': new_centers,
                     'counts': new_counts,
                     'labels': labels,
                     'm': m}).force()

    # Check whether any centroids don't have any points assigned to them.
    zcount_indices = (new_counts.glom() == 0).reshape(k)
    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector.  We reseed these
      # centroids with new random values and set their counts to 1 to avoid
      # dividing by zero.
      new_counts[zcount_indices, :] = 1
      new_centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                      num_dim)

    centers = new_centers / new_counts
  return labels

def run(N_EXAMPLES, N_DIM, iterations):
  x = expr.rand(N_EXAMPLES, N_DIM)
  y = expr.rand(N_EXAMPLES, 1)
  logistic_regression(x, y, iterations)

def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1, max_iter=10,
                 max_iter_per_doc=1):
  '''
  Use the Collapsed Variational Bayes method (Mahout implementation) to train
  an LDA topic model.

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics: the number of topics we need to find.
    alpha(float): parameter of the LDA model.
    eta(float): parameter of the LDA model.
    max_iter(int): the max iterations to train the LDA topic model.
    max_iter_per_doc: the max iterations to train each document.
  '''
  num_terms = terms_docs_matrix.shape[0]
  num_docs = terms_docs_matrix.shape[1]

  topic_term_counts = expr.rand(k_topics, num_terms)
  for i in range(max_iter):
    #topic_term_counts = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
    #                                 _lda_mapper,
    #                                 target=expr.ndarray((k_topics, num_terms), dtype=np.float64, reduce_fn=np.add),
    #                                 kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
    #                                     'max_iter_per_doc': max_iter_per_doc,
    #                                     'topic_term_counts': topic_term_counts}).optimized()
    topic_term_counts = expr.outer(
        (terms_docs_matrix, topic_term_counts), (1, None),
        fn=_lda_mapper,
        fn_kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
               'max_iter_per_doc': max_iter_per_doc},
        shape=(k_topics, num_terms),
        dtype=np.float64,
        reducer=np.add)

    # calculate the doc-topic inference
    #doc_topics = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
    #                          _lda_doc_topic_mapper,
    #                          kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
    #                              'max_iter_per_doc': max_iter_per_doc,
    #                              'topic_term_counts': topic_term_counts},
    #                          shape_hint=(num_docs, k_topics)).optimized()
    doc_topics = expr.outer(
        (terms_docs_matrix, topic_term_counts), (1, None),
        fn=_lda_doc_topic_mapper,
        fn_kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
               'max_iter_per_doc': max_iter_per_doc},
        shape=(num_docs, k_topics),
        dtype=np.float64)

    # normalize the topic-term distribution
    norm_val = expr.reduce(
        topic_term_counts,
        axis=1,
        dtype_fn=lambda input: input.dtype,
        local_reduce_fn=lambda ex, data, axis: np.abs(data).sum(axis),
        accumulate_fn=np.add)
    topic_term_counts = topic_term_counts / norm_val.reshape((k_topics, 1))

  return doc_topics, topic_term_counts

def run(N_EXAMPLES, N_DIM, iterations):
  x = expr.eager(expr.rand(N_EXAMPLES, N_DIM, tile_hint=(N_EXAMPLES / 10, 10)))
  y = expr.eager(expr.rand(N_EXAMPLES, 1, tile_hint=(N_EXAMPLES / 10, 1)))
  linear_regression(x, y, iterations)

def benchmark_sort(ctx, timer):
  A = expr.rand(10, 10, 10).force()
  T = expr.sort(A)
  print np.all(np.equal(T.glom(), np.sort(A.glom(), axis=None)))

def test_kmeans_expr(self):
  FLAGS.opt_parakeet_gen = 0
  pts = expr.rand(N_PTS, N_DIM)
  k = KMeans(N_CENTERS, ITER)
  k.fit(pts)

def fit(self, X, centers=None, implementation='map2'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                         fn_kw={"centers": centers},
                         shape=(X.shape[0], ))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # Check whether any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Set counts to 1 to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None),
                          fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0],))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # Check whether any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Set counts to 1 to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers,
                                       (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)

      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # Check whether any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Set counts to 1 to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset the accumulators to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim),
                                 reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                reduce_fn=lambda a, b: a + b)

      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1,),
                       cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # Check whether any centroids don't have any points assigned to them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Set counts to 1 to avoid dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers
    return centers, labels

def fit(self, X, centers=None, implementation='outer'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                         fn_kw={"centers": centers},
                         shape=(X.shape[0], ))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # Check whether any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Set counts to 1 to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None),
                          fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0],))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # Check whether any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Set counts to 1 to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers,
                                       (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)

      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # Check whether any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Set counts to 1 to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset the accumulators to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim),
                                 reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                reduce_fn=lambda a, b: a + b)

      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1,),
                       cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # Check whether any centroids don't have any points assigned to them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed
        # these centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Set counts to 1 to avoid dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers
    return centers, labels

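# A minimal usage sketch for the fit() method above.  It assumes a running
# Spartan context and that the enclosing KMeans class is constructed as in the
# tests earlier in this section, i.e. KMeans(n_clusters, n_iter); the sizes
# below are arbitrary examples.
pts = expr.rand(100000, 10)                 # 100,000 random 10-D points
model = KMeans(10, 5)                       # 10 centers, 5 iterations (assumed ctor)
centers, labels = model.fit(pts, implementation='outer')
print centers.glom().shape                  # (10, 10) final center coordinates
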