def _step():
  yp = expr.dot(x, w)
  Assert.all_eq(yp.shape, y.shape)
  diff = x * (yp - y)
  grad = expr.sum(diff, axis=0).glom().reshape((N_DIM, 1))
  wprime = w - grad * 1e-6
  expr.force(wprime)

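# A minimal driver sketch for _step() above, assuming a spartan context is
# already running. x, y, w, and N_DIM are the module-level names the closure
# reads; all sizes and the step count here are hypothetical.
from spartan import expr

N_EXAMPLES, N_DIM = 1000, 10
x = expr.rand(N_EXAMPLES, N_DIM)   # input points
y = expr.rand(N_EXAMPLES, 1)       # regression targets
w = expr.rand(N_DIM, 1)            # current weight vector
for _ in range(5):                 # each call forces one descent step
  _step()
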
def train_smo_1998(self, data, labels):
  '''
  Train an SVM model using the SMO (1998) algorithm.

  Args:
    data(Expr): points to be trained
    labels(Expr): the correct labels of the training data
  '''
  N = data.shape[0]  # Number of instances
  D = data.shape[1]  # Number of features

  self.b = 0.0
  self.alpha = expr.zeros((N, 1), dtype=np.float64,
                          tile_hint=[N / self.ctx.num_workers, 1]).force()

  # linear kernel
  kernel_results = expr.dot(data, expr.transpose(data),
                            tile_hint=[N / self.ctx.num_workers, N])

  labels = expr.force(labels)
  self.E = expr.zeros((N, 1), dtype=np.float64,
                      tile_hint=[N / self.ctx.num_workers, 1]).force()
  for i in xrange(N):
    self.E[i, 0] = self.b + expr.reduce(self.alpha, axis=None,
                                        dtype_fn=lambda input: input.dtype,
                                        local_reduce_fn=margin_mapper,
                                        accumulate_fn=np.add,
                                        fn_kw=dict(label=labels,
                                                   data=kernel_results[:, i].force())).glom() - labels[i, 0]

  util.log_info("Starting SMO")
  it = 0
  num_changed = 0
  examine_all = True
  while (num_changed > 0 or examine_all) and (it < self.maxiter):
    util.log_info("Iteration:%d", it)

    num_changed = 0
    if examine_all:
      for i in xrange(N):
        num_changed += self.examine_example(i, N, labels, kernel_results)
    else:
      for i in xrange(N):
        if self.alpha[i, 0] > 0 and self.alpha[i, 0] < self.C:
          num_changed += self.examine_example(i, N, labels, kernel_results)
    it += 1

    if examine_all:
      examine_all = False
    elif num_changed == 0:
      examine_all = True

  self.w = expr.zeros((D, 1), dtype=np.float64).force()
  for i in xrange(D):
    self.w[i, 0] = expr.reduce(self.alpha, axis=None,
                               dtype_fn=lambda input: input.dtype,
                               local_reduce_fn=margin_mapper,
                               accumulate_fn=np.add,
                               fn_kw=dict(label=labels,
                                          data=expr.force(data[:, i]))).glom()
  self.usew_ = True
  print 'iterations finished:', it
  print 'b:', self.b
  print 'w:', self.w.glom()

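# A hedged usage sketch for the trainer above. `SVM` is a hypothetical wrapper
# class exposing train_smo_1998 (with ctx, C, maxiter, and examine_example
# attributes); the data sizes and the +1/-1 labels are illustrative only.
import numpy as np
from spartan import expr

N, D = 1000, 20
data = expr.from_numpy(np.random.rand(N, D))
labels = expr.from_numpy(np.random.choice([-1.0, 1.0], size=(N, 1)))
svm = SVM(maxiter=50)              # assumed constructor signature
svm.train_smo_1998(data, labels)
print 'bias:', svm.b
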
def fit(self, X, y):
  """
  Parameters
  ----------
  X : array-like of shape = [n_samples, n_features]
      The training input samples.

  y : array-like, shape = [n_samples] or [n_samples, n_outputs]
      The target values (integers that correspond to classes in
      classification, real numbers in regression).

  Returns
  -------
  self : object
      Returns self.
  """
  if isinstance(X, np.ndarray):
    X = expr.from_numpy(X)
  if isinstance(y, np.ndarray):
    y = expr.from_numpy(y)

  X = expr.force(X)
  y = expr.force(y)
  self.n_classes = np.unique(y.glom()).size

  ctx = blob_ctx.get()
  n_workers = ctx.num_workers

  tasks = self._create_task_array(n_workers, self.n_estimators)
  task_array = expr.from_numpy(tasks, tile_hint=(1,)).force()
  target_array = expr.ndarray((task_array.shape[0],), dtype=object,
                              tile_hint=(1,)).force()

  results = task_array.foreach_tile(mapper_fn=_build_mapper,
                                    kw={'task_array': task_array,
                                        'target_array': target_array,
                                        'X': X,
                                        'y': y,
                                        'criterion': self.criterion,
                                        'max_depth': self.max_depth,
                                        'min_samples_split': self.min_samples_split,
                                        'min_samples_leaf': self.min_samples_leaf,
                                        'max_features': self.max_features,
                                        'bootstrap': self.bootstrap})

  # The target array stores the local random forest each worker builds;
  # it is used later for prediction.
  self.target_array = target_array
  return self

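# A hedged usage sketch for fit() above. `SpartanRandomForest` is a
# hypothetical name for the class this method belongs to; the toy data and
# constructor arguments are illustrative only.
import numpy as np

X = np.random.rand(100, 4)                 # 100 samples, 4 features
y = np.random.randint(0, 2, size=100)      # binary class labels
rf = SpartanRandomForest(n_estimators=10)  # assumed constructor
rf.fit(X, y)                               # numpy inputs are converted to Expr internally
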
def _():
  conv1 = stencil.stencil(images, w1, 2)
  pool1 = stencil.maxpool(conv1)

  conv2 = stencil.stencil(pool1, w2, 2)
  pool2 = stencil.maxpool(conv2)

  conv3 = stencil.stencil(pool2, w3, 2)
  pool3 = stencil.maxpool(conv3)

  expr.force(pool3)

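# A sketch of the inputs the pipeline above closes over: a batch of images and
# three convolution filter banks. All shapes are hypothetical; the stencil
# kernels are assumed to be laid out as (n_filters, n_channels, kh, kw).
from spartan import expr

images = expr.randn(16, 3, 64, 64)   # assumed NCHW image batch
w1 = expr.randn(8, 3, 5, 5)          # first filter bank (assumed shape)
w2 = expr.randn(16, 8, 5, 5)
w3 = expr.randn(32, 16, 5, 5)
_()                                  # run the three conv/pool stages
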
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the maximum number of iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  for iter in range(num_iter):
    centers = centers.glom()
    fuzzy = expr.map2(points, 0, fn=kmeans_map2_dist_mapper,
                      fn_kw={"centers": centers, "m": m},
                      shape=(points.shape[0], centers.shape[0]))

    labels = expr.argmax(fuzzy, axis=1)
    new_centers = expr.map2((points, fuzzy), (0, 0),
                            fn=kmeans_map2_center_mapper,
                            fn_kw={"centers": centers, "m": m},
                            shape=(centers.shape[0], centers.shape[1]),
                            reducer=np.add)
    new_centers /= expr.sum(fuzzy ** m, axis=0)[:, expr.newaxis]
    centers = new_centers
  return labels

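# A minimal usage sketch for fuzzy_kmeans() above; the point matrix size is
# illustrative only.
from spartan import expr

points = expr.rand(1000, 2)                   # 1000 two-dimensional points
labels = fuzzy_kmeans(points, k=3, num_iter=5)
print labels.glom()                           # per-point cluster assignments
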
def cholesky(A):
  '''
  Cholesky matrix decomposition.

  Args:
    A(Expr): matrix to be decomposed
  '''
  A = expr.force(A)
  n = int(math.sqrt(len(A.tiles)))
  tile_size = A.shape[0] / n
  for k in range(n):
    # A[k,k] = DPOTRF(A[k,k])
    diag_ex = get_ex(k, k, tile_size, A.shape)
    A = expr.region_map(A, diag_ex, _cholesky_dpotrf_mapper)

    if k == n - 1: break

    # A[l,k] = DTRSM(A[k,k], A[l,k])  l -> [k+1,n)
    col_ex = extent.create(((k + 1) * tile_size, k * tile_size),
                           (n * tile_size, (k + 1) * tile_size), A.shape)
    A = expr.region_map(A, col_ex, _cholesky_dtrsm_mapper,
                        fn_kw=dict(diag_ex=diag_ex))

    # A[m,m] = DSYRK(A[m,k], A[m,m])  m -> [k+1,n)
    # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m])  m -> [k+1,n)  l -> [m+1,n)
    col_exs = [extent.create((m * tile_size, m * tile_size),
                             (n * tile_size, (m + 1) * tile_size), A.shape)
               for m in range(k + 1, n)]
    A = expr.region_map(A, col_exs, _cholesky_dsyrk_dgemm_mapper,
                        fn_kw=dict(k=k))

  # Zero out the strictly upper-triangular tiles.
  col_exs = [extent.create((0, m * tile_size),
                           (m * tile_size, (m + 1) * tile_size), A.shape)
             for m in range(1, n)]
  A = expr.region_map(A, col_exs,
                      lambda input, array, ex: np.zeros(input.shape, input.dtype))
  return A

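# A hedged usage sketch for cholesky() above. The routine assumes A is
# symmetric positive definite and tiled into a square grid (it derives the
# grid size from len(A.tiles)); the construction below is illustrative only.
import numpy as np
from spartan import expr

m = np.random.rand(16, 16)
spd = np.dot(m, m.T) + 16 * np.eye(16)        # symmetric positive definite
A = expr.from_numpy(spd, tile_hint=(8, 8))    # 2x2 grid of tiles (assumed hint)
L = cholesky(A)
print L.glom()                                # lower-triangular factor
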
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train a standard naive Bayes model.

  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of the naive Bayes model.
  '''
  labels = expr.force(labels)

  # Calculate the document frequency of each feature.
  df = expr.reduce(data, axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add,
                   tile_hint=(data.shape[1],))
  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1

  # The normalized frequency of a feature in a document is its frequency
  # divided by the root mean square of all feature frequencies in that document.
  square_sum = expr.reduce(data, axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add,
                           tile_hint=(data.shape[0],))
  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])

  # Calculate the weight-normalized TF-IDF.
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))

  # Add up all the feature vectors with the same labels.
  sum_instance_by_label = expr.ndarray((label_size, data.shape[1]),
                                       dtype=np.float64,
                                       reduce_fn=np.add,
                                       tile_hint=(label_size / len(labels.tiles), data.shape[1]))
  sum_instance_by_label = expr.shuffle(data,
                                       _sum_instance_by_label_mapper,
                                       target=sum_instance_by_label,
                                       kw={'labels': labels, 'label_size': label_size})

  # Sum up all the weights for each label from the previous step.
  weights_per_label = expr.sum(sum_instance_by_label, axis=1,
                               tile_hint=(label_size,))

  # Generate the naive Bayes per-label-and-feature weights.
  weights_per_label_and_feature = expr.shuffle(sum_instance_by_label,
                                               _naive_bayes_mapper,
                                               kw={'weights_per_label': weights_per_label,
                                                   'alpha': alpha})

  return {'scores_per_label_and_feature': weights_per_label_and_feature.force(),
          'scores_per_label': weights_per_label.force(),
          }

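# A hedged usage sketch for fit() above; the document-term counts and labels
# are illustrative only.
import numpy as np
from spartan import expr

n_docs, n_features, label_size = 100, 50, 4
data = expr.from_numpy(np.random.randint(0, 5, size=(n_docs, n_features)).astype(np.float64))
labels = expr.from_numpy(np.random.randint(0, label_size, size=(n_docs, 1)))
model = fit(data, labels, label_size, alpha=1.0)
print model['scores_per_label'].glom()
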
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the maximum number of iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = expr.as_array(centers)
    points_broadcast = expr.reshape(points, (points.shape[0], 1, points.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(points_broadcast - centers_broadcast), axis=2)
    # Avoid division by zero.
    distances = distances + 1e-11

    util.log_info('distances shape %s' % str(distances.shape))
    distances_broadcast = expr.reshape(distances, (distances.shape[0], 1,
                                                   distances.shape[1]))
    distances_broadcast2 = expr.reshape(distances, (distances.shape[0],
                                                    distances.shape[1], 1))
    prob = 1.0 / expr.sum(expr.power(distances_broadcast / distances_broadcast2,
                                     2.0 / (m - 1)), axis=2)
    prob.force()

    counts = expr.sum(prob, axis=0)
    counts = expr.reshape(counts, (counts.shape[0], 1))
    labels = expr.argmax(prob, axis=1)

    centers = expr.sum(expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
                       expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
                       axis=0)

    # We assume the centers are small enough to be handled on the master.
    counts = counts.glom()
    centers = centers.glom()

    # Check whether any centroids have no points assigned to them.
    zcount_indices = (counts == 0).reshape(k)
    if np.any(zcount_indices):
      # One or more centroids have no points assigned, which makes their
      # position the zero vector. Reseed them with new random values and set
      # their counts to 1 to avoid dividing by zero.
      counts[zcount_indices, :] = 1
      centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                  num_dim)

    centers = centers / counts
  return labels

def cholesky(A):
  '''
  Cholesky matrix decomposition.

  Args:
    A(Expr): matrix to be decomposed
  '''
  A = expr.force(A)
  n = int(math.sqrt(len(A.tiles)))
  tile_size = A.shape[0] / n
  for k in range(n):
    # A[k,k] = DPOTRF(A[k,k])
    diag_ex = get_ex(k, k, tile_size, A.shape)
    A = expr.map2(A, ((0, 1),), fn=_cholesky_dpotrf_mapper,
                  shape=A.shape, update_region=diag_ex)

    if k == n - 1: break

    # A[l,k] = DTRSM(A[k,k], A[l,k])  l -> [k+1,n)
    col_ex = extent.create(((k + 1) * tile_size, k * tile_size),
                           (n * tile_size, (k + 1) * tile_size), A.shape)
    diag_tile = A.force().fetch(diag_ex)
    A = expr.map2(A, ((0, 1),), fn=_cholesky_dtrsm_mapper,
                  fn_kw=dict(array=expr.force(A), diag_tile=diag_tile),
                  shape=A.shape, update_region=col_ex)

    # A[m,m] = DSYRK(A[m,k], A[m,m])  m -> [k+1,n)
    # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m])  m -> [k+1,n)  l -> [m+1,n)
    col_exs = [extent.create((m * tile_size, m * tile_size),
                             (n * tile_size, (m + 1) * tile_size), A.shape)
               for m in range(k + 1, n)]
    dgemm_1 = expr.transpose(A)[(k * tile_size):((k + 1) * tile_size), :]
    dgemm_2 = A[:, (k * tile_size):((k + 1) * tile_size)]
    A = expr.map2((A, dgemm_1, dgemm_2), ((0, 1), 1, 0),
                  fn=_cholesky_dsyrk_dgemm_mapper,
                  fn_kw=dict(array=expr.force(A), k=k),
                  shape=A.shape, update_region=col_exs)

  # Zero out the strictly upper-triangular tiles.
  col_exs = [extent.create((0, m * tile_size),
                           (m * tile_size, (m + 1) * tile_size), A.shape)
             for m in range(1, n)]
  A = expr.map2(A, ((0, 1),), fn=_zero_mapper,
                shape=A.shape, update_region=col_exs)
  return A

def precompute(self):
  '''Precompute the k most similar items for each item.

  After this function returns, two attributes will be created.

  Attributes
  ----------
  top_k_similar_table : NumPy array of shape (N, k).
      Records the k highest similarity scores for each item.
  top_k_similar_indices : NumPy array of shape (N, k).
      Records the indices of the k most similar items for each item.
  '''
  M = self.rating_table.shape[0]
  N = self.rating_table.shape[1]
  self.rating_table = expr.force(self.rating_table)
  assert self.rating_table.tile_shape()[0] == M, \
      "rating table is only allowed to tile by columns!"

  self.similarity_table = expr.zeros(shape=(N, N),
                                     tile_hint=(self.rating_table.tile_shape()[1], N)).force()
  self.item_norm = self._get_norm_of_each_item(self.rating_table)
  self.rating_table.foreach_tile(mapper_fn=_similarity_mapper,
                                 kw={'rating_table': self.rating_table,
                                     'similarity_table': self.similarity_table,
                                     'item_norm': self.item_norm,
                                     'step': self.rating_table.tile_shape()[1]})

  # Release the memory for item_norm.
  self.item_norm = None

  k = self.k
  top_k_similar_table = expr.zeros((N, k),
                                   tile_hint=(self.rating_table.tile_shape()[1], k)).force()
  top_k_similar_indices = expr.zeros((N, k),
                                     tile_hint=(self.rating_table.tile_shape()[1], k),
                                     dtype=np.int).force()

  # Find the top-k similar items for each item. Store the similarity scores
  # in top_k_similar_table and the indices of the top-k items in
  # top_k_similar_indices.
  self.similarity_table.foreach_tile(mapper_fn=_select_most_k_similar_mapper,
                                     kw={'similarity_table': self.similarity_table,
                                         'top_k_similar_table': top_k_similar_table,
                                         'top_k_similar_indices': top_k_similar_indices,
                                         'k': k})
  self.top_k_similar_table = top_k_similar_table.glom()
  self.top_k_similar_indices = top_k_similar_indices.glom()

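# A hedged usage sketch for precompute() above. `ItemBasedRecommender` is a
# hypothetical owner of rating_table and k; the table must be tiled by
# columns, and all sizes here are illustrative only.
import numpy as np
from spartan import expr

M, N = 50, 200                                          # users x items
ratings = expr.from_numpy(np.random.rand(M, N), tile_hint=(M, N / 4))
rec = ItemBasedRecommender(rating_table=ratings, k=10)  # assumed constructor
rec.precompute()
print rec.top_k_similar_indices[:5]                     # neighbors of the first 5 items
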
def fit(self, X, centers=None):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, they will be randomly generated.
  """
  X = expr.force(X)
  num_dim = X.shape[1]
  labels = expr.zeros((X.shape[0], 1), dtype=np.int, tile_hint=X.tile_shape())

  if centers is None:
    centers = np.random.rand(self.n_clusters, num_dim)

  for i in range(self.n_iter):
    # Reset the accumulators to zero.
    new_centers = expr.ndarray((self.n_clusters, num_dim),
                               reduce_fn=lambda a, b: a + b)
    new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                              reduce_fn=lambda a, b: a + b)

    _ = expr.shuffle(X, _find_cluster_mapper,
                     kw={'d_pts': X,
                         'old_centers': centers,
                         'new_centers': new_centers,
                         'new_counts': new_counts,
                         'labels': labels})
    _.force()

    new_counts = new_counts.glom()
    new_centers = new_centers.glom()

    # Check whether any centroids have no points assigned to them.
    zcount_indices = (new_counts == 0).reshape(self.n_clusters)

    if np.any(zcount_indices):
      # One or more centroids have no points assigned, which makes their
      # position the zero vector. Reseed them with new random values.
      n_points = np.count_nonzero(zcount_indices)
      # Set the counts to 1 to avoid dividing by zero.
      new_counts[zcount_indices] = 1
      new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

    new_centers = new_centers / new_counts
    centers = new_centers

  return centers, labels

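# A hedged usage sketch for fit() above. `KMeans` is assumed to be the class
# this method belongs to; the data size and constructor arguments are
# illustrative only.
from spartan import expr

X = expr.rand(1000, 2)                # 1000 two-dimensional points
km = KMeans(n_clusters=3, n_iter=10)  # assumed constructor
centers, labels = km.fit(X)
print labels.glom()[:10]              # cluster assignment of the first 10 points
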
def conj_gradient(A, num_iter=15):
  '''
  NAS Conjugate Gradient benchmark

  This function is similar to the NAS CG benchmark described in:
  http://www.nas.nasa.gov/News/Techreports/1994/PDF/RNR-94-007.pdf
  See the pseudo code on pages 19-20.

  Args:
    A(Expr): matrix to be processed.
    num_iter(int): the maximum number of iterations to run.
  '''
  A = expr.force(A)
  x = expr.ones((A.shape[1], 1), tile_hint=(A.tile_shape()[1], 1))

  for iter in range(num_iter):
    #util.log_warn('iteration:%d', iter)
    z = cgit(A, x)
    x = z / expr.norm(z)
  return x

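# A minimal usage sketch for conj_gradient() above; the matrix size is
# illustrative only. The return value is the final normalized iterate.
from spartan import expr

A = expr.rand(100, 100)
x = conj_gradient(A, num_iter=15)
print x.glom()[:5]
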
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the maximum number of iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim, tile_hint=(k, num_dim))

  labels = expr.zeros((points.shape[0],), dtype=np.int,
                      tile_hint=(points.shape[0] / len(points.tiles),))

  for iter in range(num_iter):
    new_centers = expr.ndarray((k, num_dim), reduce_fn=lambda a, b: a + b,
                               tile_hint=(k, num_dim))
    new_counts = expr.ndarray((k, 1), dtype=np.float,
                              reduce_fn=lambda a, b: a + b, tile_hint=(k, 1))
    expr.shuffle(points, _fuzzy_kmeans_mapper,
                 kw={'old_centers': centers,
                     'centers': new_centers,
                     'counts': new_counts,
                     'labels': labels,
                     'm': m}).force()

    # Check whether any centroids have no points assigned to them.
    zcount_indices = (new_counts.glom() == 0).reshape(k)
    if np.any(zcount_indices):
      # One or more centroids have no points assigned, which makes their
      # position the zero vector. Reseed them with new random values and set
      # their counts to 1 to avoid dividing by zero.
      new_counts[zcount_indices, :] = 1
      new_centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                      num_dim)

    centers = new_centers / new_counts
  return labels

def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20,
        num_iter=10):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS)
  method, where `A` is the "ratings" matrix which maps from a user and item to
  a rating score, and `U` and `M` are the factor matrices representing user
  and item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item
      to a rating score.
    la(float): the regularization parameter lambda of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit-feedback method for ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): the maximum number of iterations to run.
  '''
  A = expr.force(A)
  AT = expr.shuffle(expr.ndarray((A.shape[1], A.shape[0]), dtype=A.dtype,
                                 tile_hint=(A.shape[1] / len(A.tiles), A.shape[0])),
                    _transpose_mapper, kw={'orig_array': A})

  num_items = A.shape[1]

  avg_rating = expr.sum(A, axis=0, tile_hint=(num_items / len(A.tiles),)) * 1.0 / \
               expr.count_nonzero(A, axis=0, tile_hint=(num_items / len(A.tiles),))

  M = expr.shuffle(expr.ndarray((num_items, num_features),
                                tile_hint=(num_items / len(A.tiles), num_features)),
                   _init_M_mapper, kw={'avg_rating': avg_rating})
  #util.log_warn('avg_rating:%s M:%s', avg_rating.glom(), M.glom())

  for i in range(num_iter):
    # Recompute U.
    U = expr.shuffle(A, _solve_U_or_M_mapper,
                     kw={'U_or_M': M, 'la': la, 'alpha': alpha,
                         'implicit_feedback': implicit_feedback})
    # Recompute M.
    M = expr.shuffle(AT, _solve_U_or_M_mapper,
                     kw={'U_or_M': U, 'la': la, 'alpha': alpha,
                         'implicit_feedback': implicit_feedback})
  return U, M

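# A hedged usage sketch for als() above; the rating matrix is illustrative
# only (zeros stand for "unrated" entries).
import numpy as np
from spartan import expr

n_users, n_items = 100, 80
ratings = np.random.randint(0, 6, size=(n_users, n_items)).astype(np.float64)
A = expr.from_numpy(ratings)
U, M = als(A, num_features=20, num_iter=5)
approx = expr.dot(U, expr.transpose(M))   # reconstructed rating estimates
print approx.glom()[0, :5]
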
def _step():
  y = expr.force(x * x)

def bs_step(current, strike, maturity, rate, volatility):
  put, call = finance.black_scholes(current, strike, maturity, rate, volatility)
  call = call.optimized()
  force(call)

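# A hedged usage sketch for bs_step() above, assuming finance.black_scholes
# operates element-wise on arrays of option parameters and that the scalar
# rate and volatility broadcast; all sizes and constants are illustrative only.
from spartan import expr

n = 1000000
current = expr.rand(n) * 100.0    # spot prices
strike = expr.rand(n) * 100.0     # strike prices
maturity = expr.rand(n) + 0.5     # years to maturity
bs_step(current, strike, maturity, 0.05, 0.2)
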
def train_smo_2005(self, data, labels):
  '''
  Train an SVM model using the SMO (2005) algorithm.

  Args:
    data(Expr): points to be trained
    labels(Expr): the correct labels of the training data
  '''
  N = data.shape[0]  # Number of instances
  D = data.shape[1]  # Number of features

  self.b = 0.0
  alpha = expr.zeros((N, 1), dtype=np.float64,
                     tile_hint=[N / self.ctx.num_workers, 1]).force()

  # linear kernel
  kernel_results = expr.dot(data, expr.transpose(data),
                            tile_hint=[N / self.ctx.num_workers, N])

  gradient = expr.ones((N, 1), dtype=np.float64,
                       tile_hint=[N / self.ctx.num_workers, 1]) * -1.0

  expr_labels = expr.lazify(labels)

  util.log_info("Starting SMO")
  pv1 = pv2 = -1
  it = 0
  while it < self.maxiter:
    util.log_info("Iteration:%d", it)

    minObj = 1e100

    expr_alpha = expr.lazify(alpha)
    G = expr.multiply(labels, gradient) * -1.0

    v1_mask = ((expr_labels > self.tol) * (expr_alpha < self.C) +
               (expr_labels < -self.tol) * (expr_alpha > self.tol))
    v1 = expr.argmax(G[v1_mask - True]).glom().item()
    maxG = G[v1, 0].glom()
    print 'maxv1:', v1, 'maxG:', maxG

    v2_mask = ((expr_labels > self.tol) * (expr_alpha > self.tol) +
               (expr_labels < -self.tol) * (expr_alpha < self.C))
    min_v2 = expr.argmin(G[v2_mask - True]).glom().item()
    minG = G[min_v2, 0].glom()
    #print 'minv2:', min_v2, 'minG:', minG

    set_v2 = v2_mask.glom().nonzero()[0]
    #print 'actives:', set_v2.shape[0]
    v2 = -1
    for v in set_v2:
      b = maxG - G[v, 0].glom()
      if b > self.tol:
        na = (kernel_results[v1, v1] + kernel_results[v, v] -
              2 * kernel_results[v1, v]).glom()[0][0]
        if na < self.tol:
          na = 1e12

        obj = -(b * b) / na
        if (obj <= minObj and v1 != pv1) or v != pv2:
          v2 = v
          a = na
          minObj = obj

    if v2 == -1: break
    if maxG - minG < self.tol: break

    print 'opt v1:', v1, 'v2:', v2

    pv1 = v1
    pv2 = v2

    y1 = labels[v1, 0]
    y2 = labels[v2, 0]

    oldA1 = alpha[v1, 0]
    oldA2 = alpha[v2, 0]

    # Calculate new alpha values to reduce the objective function...
    b = y2 * expr.glom(gradient[v2, 0]) - y1 * expr.glom(gradient[v1, 0])
    if y1 != y2:
      a += 4 * kernel_results[v1, v2].glom()

    newA1 = oldA1 + y1 * b / a
    newA2 = oldA2 - y2 * b / a

    # Correct for alpha being out of range...
    sum = y1 * oldA1 + y2 * oldA2
    if newA1 < self.tol: newA1 = 0.0
    elif newA1 > self.C: newA1 = self.C

    newA2 = y2 * (sum - y1 * newA1)
    if newA2 < self.tol: newA2 = 0.0
    elif newA2 > self.C: newA2 = self.C

    newA1 = y1 * (sum - y2 * newA2)

    # Update the gradient...
    dA1 = newA1 - oldA1
    dA2 = newA2 - oldA2

    gradient += (expr.multiply(labels, kernel_results[:, v1]) * y1 * dA1 +
                 expr.multiply(labels, kernel_results[:, v2]) * y2 * dA2)

    alpha[v1, 0] = newA1
    alpha[v2, 0] = newA2
    #print 'alpha:', alpha.glom().T

    it += 1
    #print 'gradient:', gradient.glom().T

  self.w = expr.zeros((D, 1), dtype=np.float64).force()
  for i in xrange(D):
    self.w[i, 0] = expr.reduce(alpha, axis=None,
                               dtype_fn=lambda input: input.dtype,
                               local_reduce_fn=margin_mapper,
                               accumulate_fn=np.add,
                               fn_kw=dict(label=labels,
                                          data=expr.force(data[:, i]))).glom()

  self.b = 0.0
  E = (labels - self.margins(data)).force()

  minB = -1e100
  maxB = 1e100
  actualB = 0.0
  numActualB = 0

  for i in xrange(N):
    ai = alpha[i, 0]
    yi = labels[i, 0]
    Ei = E[i, 0]

    if ai < 1e-3:
      if yi < self.tol:
        maxB = min((maxB, Ei))
      else:
        minB = max((minB, Ei))
    elif ai > self.C - 1e-3:
      if yi < self.tol:
        minB = max((minB, Ei))
      else:
        maxB = min((maxB, Ei))
    else:
      numActualB += 1
      actualB += (Ei - actualB) / float(numActualB)

  if numActualB > 0:
    self.b = actualB
  else:
    self.b = 0.5 * (minB + maxB)

  self.usew_ = True
  print 'iterations finished:', it
  print 'b:', self.b
  print 'w:', self.w.glom()

def test_slice_get(self):
  x = expr.arange((TEST_SIZE, TEST_SIZE))
  z = x[5:8, 5:8]
  val = expr.force(z)

  nx = np.arange(TEST_SIZE * TEST_SIZE).reshape(TEST_SIZE, TEST_SIZE)

  Assert.all_eq(val.glom(), nx[5:8, 5:8])

def solve(A, AT, desired_rank, is_symmetric=False):
  '''
  A simple implementation of the Lanczos algorithm
  (http://en.wikipedia.org/wiki/Lanczos_algorithm) for eigenvalue computation.

  Like the Mahout implementation, only the matrix*vector step is parallelized.

  First we use the Lanczos method to reduce the matrix to tridiagonal form,
  then we use the numpy.linalg.eig function to extract the eigenvalues and
  eigenvectors from the tridiagonal matrix (desired_rank * desired_rank).
  Since desired_rank is much smaller than the size of the matrix, we can do
  this efficiently on the local machine.
  '''
  A = expr.force(A)
  AT = expr.force(AT)
  ctx = blob_ctx.get()
  # Calculate two more eigenvalues, but we only keep the largest desired_rank
  # ones. Doing this keeps the result consistent with scipy.sparse.linalg.svds.
  desired_rank += 2

  n = A.shape[1]
  v_next = np.ones(n) / np.sqrt(n)
  v_prev = np.zeros(n)
  beta = np.zeros(desired_rank + 1)
  beta[0] = 0
  alpha = np.zeros(desired_rank)

  # Since desired_rank << size of the matrix, we keep V in local memory for
  # efficiency (it needs to be updated on every iteration). If V cannot fit
  # in local memory, you could turn it into a spartan distributed array.
  V = np.zeros((n, desired_rank))

  for i in range(0, desired_rank):
    util.log_info("Iter : %s", i)
    v_next_expr = expr.from_numpy(v_next.reshape(n, 1),
                                  tile_hint=(n / ctx.num_workers, 1))

    if is_symmetric:
      w = expr.dot(A, v_next_expr).glom().reshape(n)
    else:
      w = expr.dot(A, v_next_expr, tile_hint=(min(*A.tile_shape()), 1)).force()
      w = expr.dot(AT, w, tile_hint=(min(*A.tile_shape()), 1)).glom().reshape(n)

    alpha[i] = np.dot(w, v_next)
    w = w - alpha[i] * v_next - beta[i] * v_prev

    # Orthogonalize:
    for t in range(i):
      tmpa = np.dot(w, V[:, t])
      if tmpa == 0.0:
        continue
      w -= tmpa * V[:, t]

    beta[i + 1] = np.linalg.norm(w, 2)
    v_prev = v_next
    v_next = w / beta[i + 1]
    V[:, i] = v_prev

  # Create a tridiagonal matrix of size (desired_rank x desired_rank).
  tridiag = np.diag(alpha)
  for i in range(0, desired_rank - 1):
    tridiag[i, i + 1] = beta[i + 1]
    tridiag[i + 1, i] = beta[i + 1]

  # Get the eigenvectors and eigenvalues of this tridiagonal matrix. Its
  # eigenvalues equal those of dot(A.T, A); we can recover the eigenvectors
  # of dot(A.T, A) by multiplying V with the eigenvectors of the tridiagonal
  # matrix.
  d, v = np.linalg.eig(tridiag)

  # Sort the eigenvalues and their corresponding eigenvectors.
  sorted_idx = np.argsort(np.absolute(d))[::-1]
  d = d[sorted_idx]
  v = v[:, sorted_idx]

  # Get the eigenvectors of dot(A.T, A).
  s = np.dot(V, v)
  return d[0:desired_rank - 2], s[:, 0:desired_rank - 2]

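# A hedged usage sketch for solve() above; the matrix and rank are
# illustrative only. For a non-symmetric A the iteration applies
# dot(AT, dot(A, v)), so the returned values approximate the top
# eigenvalues of dot(A.T, A).
import numpy as np
from spartan import expr

m = np.random.rand(200, 100)
A = expr.from_numpy(m)
AT = expr.from_numpy(m.T)
d, s = solve(A, AT, desired_rank=6)
print d            # approximate eigenvalues, largest first
print s.shape      # one eigenvector per column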