def fit(self, X, centers=None):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly
      generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if centers is None:
    centers = expr.from_numpy(np.random.rand(self.n_clusters, num_dim))

  for i in range(self.n_iter):
    # Squared distance from every point to every center, via broadcasting:
    # (n_samples, 1, n_features) - (1, n_clusters, n_features).
    X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
    centers_broadcast = expr.reshape(centers,
                                     (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
    labels = expr.argmin(distances, axis=1)

    # One-hot membership matrix: matches[i, j] == 1 iff point i is assigned
    # to center j.
    center_idx = expr.arange((1, centers.shape[0]))
    matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
    matches = matches.astype(np.int64)
    counts = expr.sum(matches, axis=0)
    centers = expr.sum(X_broadcast *
                       expr.reshape(matches,
                                    (matches.shape[0], matches.shape[1], 1)),
                       axis=0)

    counts = counts.optimized().glom()
    centers = centers.optimized().glom()

    # One or more centroids may not have any points assigned to them, which
    # results in their position being the zero-vector. We reseed these
    # centroids with new random values.
    zcount_indices = (counts == 0).reshape(self.n_clusters)
    if np.any(zcount_indices):
      n_points = np.count_nonzero(zcount_indices)
      # Avoid dividing by zero.
      counts[zcount_indices] = 1
      centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

    centers = centers / counts.reshape(centers.shape[0], 1)
    centers = expr.from_numpy(centers)
  return centers, labels
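
def _kmeans_fit_demo():
  # Illustrative usage sketch, not part of the original module. It assumes a
  # KMeans-style wrapper class that exposes the `fit` above together with
  # `n_clusters` and `n_iter` attributes, and an already-initialized Spartan
  # context; `KMeans` is a hypothetical name for that wrapper.
  pts = expr.from_numpy(np.random.rand(1000, 2))  # 1000 random 2-D points
  model = KMeans(n_clusters=3, n_iter=10)         # hypothetical wrapper
  centers, labels = model.fit(pts)                # None => random init centers
  return centers.glom(), labels.glom()            # materialize as numpy arrays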
def train_smo_2005(self, data, labels):
  '''
  Train an SVM model using the SMO (2005) algorithm.

  Args:
    data(Expr): points to be trained
    labels(Expr): the correct labels of the training data
  '''
  N = data.shape[0]  # Number of instances
  D = data.shape[1]  # Number of features

  self.b = 0.0
  alpha = expr.zeros((N, 1), dtype=np.float64,
                     tile_hint=[N / self.ctx.num_workers, 1]).force()

  # Linear kernel: K = X . X^T
  kernel_results = expr.dot(data, expr.transpose(data),
                            tile_hint=[N / self.ctx.num_workers, N])

  gradient = expr.ones((N, 1), dtype=np.float64,
                       tile_hint=[N / self.ctx.num_workers, 1]) * -1.0

  expr_labels = expr.lazify(labels)

  util.log_info("Starting SMO")
  pv1 = pv2 = -1
  it = 0
  while it < self.maxiter:
    util.log_info("Iteration:%d", it)

    minObj = 1e100

    expr_alpha = expr.lazify(alpha)
    G = expr.multiply(labels, gradient) * -1.0

    # First element of the working pair: the most violating index among the
    # points whose multiplier can still move up.
    v1_mask = ((expr_labels > self.tol) * (expr_alpha < self.C) +
               (expr_labels < -self.tol) * (expr_alpha > self.tol))
    v1 = expr.argmax(G[v1_mask - True]).glom().item()
    maxG = G[v1, 0].glom()
    print 'maxv1:', v1, 'maxG:', maxG

    v2_mask = ((expr_labels > self.tol) * (expr_alpha > self.tol) +
               (expr_labels < -self.tol) * (expr_alpha < self.C))
    min_v2 = expr.argmin(G[v2_mask - True]).glom().item()
    minG = G[min_v2, 0].glom()
    #print 'minv2:', min_v2, 'minG:', minG

    # Second element: scan the active set for the best second-order gain.
    set_v2 = v2_mask.glom().nonzero()[0]
    #print 'actives:', set_v2.shape[0]
    v2 = -1
    for v in set_v2:
      b = maxG - G[v, 0].glom()
      if b > self.tol:
        na = (kernel_results[v1, v1] + kernel_results[v, v] -
              2 * kernel_results[v1, v]).glom()[0][0]
        if na < self.tol:
          na = 1e12

        obj = -(b * b) / na
        # Prefer the best objective, but never re-pick the previous pair.
        if obj <= minObj and (v1 != pv1 or v != pv2):
          v2 = v
          a = na
          minObj = obj

    if v2 == -1:
      break
    if maxG - minG < self.tol:
      break

    print 'opt v1:', v1, 'v2:', v2

    pv1 = v1
    pv2 = v2

    y1 = labels[v1, 0]
    y2 = labels[v2, 0]
    oldA1 = alpha[v1, 0]
    oldA2 = alpha[v2, 0]

    # Calculate new alpha values to reduce the objective function...
    b = y2 * expr.glom(gradient[v2, 0]) - y1 * expr.glom(gradient[v1, 0])
    if y1 != y2:
      a += 4 * kernel_results[v1, v2].glom()

    newA1 = oldA1 + y1 * b / a
    newA2 = oldA2 - y2 * b / a

    # Correct for alpha being out of range...
    sum = y1 * oldA1 + y2 * oldA2
    if newA1 < self.tol:
      newA1 = 0.0
    elif newA1 > self.C:
      newA1 = self.C

    newA2 = y2 * (sum - y1 * newA1)
    if newA2 < self.tol:
      newA2 = 0.0
    elif newA2 > self.C:
      newA2 = self.C

    newA1 = y1 * (sum - y2 * newA2)

    # Update the gradient...
    dA1 = newA1 - oldA1
    dA2 = newA2 - oldA2

    gradient += (expr.multiply(labels, kernel_results[:, v1]) * y1 * dA1 +
                 expr.multiply(labels, kernel_results[:, v2]) * y2 * dA2)

    alpha[v1, 0] = newA1
    alpha[v2, 0] = newA2
    #print 'alpha:', alpha.glom().T

    it += 1
    #print 'gradient:', gradient.glom().T

  # Recover the weight vector from the trained alpha, one feature at a time.
  self.w = expr.zeros((D, 1), dtype=np.float64).force()
  for i in xrange(D):
    self.w[i, 0] = expr.reduce(alpha, axis=None,
                               dtype_fn=lambda input: input.dtype,
                               local_reduce_fn=margin_mapper,
                               accumulate_fn=np.add,
                               fn_kw=dict(label=labels,
                                          data=expr.force(data[:, i]))).glom()

  # Estimate the bias: average the error over the free support vectors
  # (0 < alpha < C); otherwise fall back to the midpoint of the feasible range.
  self.b = 0.0
  E = (labels - self.margins(data)).force()

  minB = -1e100
  maxB = 1e100
  actualB = 0.0
  numActualB = 0
  for i in xrange(N):
    ai = alpha[i, 0]
    yi = labels[i, 0]
    Ei = E[i, 0]
    if ai < 1e-3:
      if yi < self.tol:
        maxB = min((maxB, Ei))
      else:
        minB = max((minB, Ei))
    elif ai > self.C - 1e-3:
      if yi < self.tol:
        minB = max((minB, Ei))
      else:
        maxB = min((maxB, Ei))
    else:
      numActualB += 1
      actualB += (Ei - actualB) / float(numActualB)

  if numActualB > 0:
    self.b = actualB
  else:
    self.b = 0.5 * (minB + maxB)

  self.usew_ = True
  print 'iteration finish:', it
  print 'b:', self.b
  print 'w:', self.w.glom()
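
def _smo_pair_update_sketch(y1, y2, oldA1, oldA2, g1, g2, k11, k22, k12, C, tol):
  # Illustrative scalar sketch, not part of the original module: the
  # two-variable update performed inside the training loop above, on plain
  # Python floats with no Spartan involved. g1/g2 are the gradient entries and
  # k11/k22/k12 the kernel entries for the chosen pair (v1, v2).
  b = y2 * g2 - y1 * g1
  a = k11 + k22 - 2.0 * k12   # curvature along the update direction
  if a < tol:
    a = 1e12                  # guard against flat or negative curvature
  if y1 != y2:
    a += 4.0 * k12
  newA1 = oldA1 + y1 * b / a
  newA2 = oldA2 - y2 * b / a
  # y1*a1 + y2*a2 is conserved, so clip each multiplier into [0, C] in turn
  # and recompute its partner from the conserved sum.
  s = y1 * oldA1 + y2 * oldA2
  if newA1 < tol:
    newA1 = 0.0
  elif newA1 > C:
    newA1 = C
  newA2 = y2 * (s - y1 * newA1)
  if newA2 < tol:
    newA2 = 0.0
  elif newA2 > C:
    newA2 = C
  newA1 = y1 * (s - y2 * newA2)
  return newA1, newA2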
def fit(self, X, centers=None, implementation='outer'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly
      generated.
  implementation : str. One of 'map2', 'outer', 'broadcast' or 'shuffle';
      selects which Spartan operator drives the update step.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                         fn_kw={'centers': centers},
                         shape=(X.shape[0], ))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector. We reseed these
      # centroids with new random values.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None),
                          fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0], ))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector. We reseed these
      # centroids with new random values.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers,
                                       (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast),
                           axis=2)
      labels = expr.argmin(distances, axis=1)

      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast *
                         expr.reshape(matches,
                                      (matches.shape[0], matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector. We reseed these
      # centroids with new random values.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset the accumulators to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim),
                                 reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                reduce_fn=lambda a, b: a + b)

      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1, ),
                       cost_hint={hash(labels): {'00': 0,
                                                 '01': np.prod(labels.shape)}})
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector. We reseed these
      # centroids with new random values.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers

    return centers, labels
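
def _kmeans_update_reference(points, centers):
  # Plain-NumPy reference for a single k-means update, added for illustration:
  # it shows what each of the four Spartan implementations above computes per
  # iteration, with no distribution involved. Shapes: points (n, d),
  # centers (k, d).
  dists = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
  labels = dists.argmin(axis=1)                     # nearest center per point
  k, d = centers.shape
  matches = (labels[:, None] == np.arange(k)).astype(np.int64)
  counts = matches.sum(axis=0)                      # points per center
  new_centers = (points[:, None, :] * matches[:, :, None]).sum(axis=0)
  empty = counts == 0
  if np.any(empty):
    # Reseed empty centers and avoid dividing by zero, as in the code above.
    counts[empty] = 1
    new_centers[empty, :] = np.random.randn(np.count_nonzero(empty), d)
  return new_centers / counts[:, None], labels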