Example no. 1
0
  def test_kmeans_expr(self):
    """Smoke-test KMeans.fit on random points tiled across the workers."""
    # Ask the blob context how many workers exist so the random matrix can
    # be tiled with one row-chunk per worker.
    context = spartan.blob_ctx.get()
    rows_per_worker = divup(N_PTS, context.num_workers)
    points = expr.rand(N_PTS, N_DIM,
                       tile_hint=(rows_per_worker, N_DIM)).force()
    KMeans(N_CENTERS, ITER).fit(points)
Example no. 2
0
def benchmark_kmeans(ctx, timer):
  print "#worker:", ctx.num_workers
  N_PTS = 1000 * 256
  N_CENTERS = 10
  N_DIM = 512
  ITER = 1
  pts = expr.rand(N_PTS, N_DIM)
  k = KMeans(N_CENTERS, ITER)
  t1 = datetime.now()
  k.fit(pts)
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/ITER)
Example no. 3
0
def benchmark_kmeans(ctx, timer):
    print "#worker:", ctx.num_workers
    N_PTS = 1000 * 256
    N_CENTERS = 10
    N_DIM = 512
    ITER = 1
    pts = expr.rand(N_PTS, N_DIM)
    k = KMeans(N_CENTERS, ITER)
    t1 = datetime.now()
    k.fit(pts)
    t2 = datetime.now()
    cost_time = millis(t1, t2)
    print "total cost time:%s ms, per iter cost time:%s ms" % (
        cost_time, cost_time / ITER)
Example no. 4
0
  def test_kmeans(self):
    """Run KMeans on a worker-scaled random dataset and record the runtime."""
    _skip_if_travis()
    # Scale the point count with the cluster size so per-worker load is fixed.
    n_points = 1000 * 1000 * self.ctx.num_workers
    n_iter = 5
    n_dim = 10
    n_centers = 10

    begin = time.time()

    data = expr.rand(n_points, n_dim).force()
    KMeans(n_centers, n_iter).fit(data)

    elapsed = time.time() - begin
    self._verify_cost("kmeans", elapsed)
Example no. 5
0
  def test_kmeans(self):
    """Time a KMeans fit over random points and check against the cost baseline."""
    _skip_if_travis()
    # One million points per worker keeps the per-worker load constant.
    num_points = 1000 * 1000 * self.ctx.num_workers
    iterations = 5
    dims = 10
    centers = 10

    t_start = time.time()

    points = expr.rand(num_points, dims).evaluate()
    model = KMeans(centers, iterations)
    model.fit(points)

    self._verify_cost("kmeans", time.time() - t_start)
Example no. 6
0
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
    '''
    Cluster data points using kmeans-based spectral clustering.

    Args:
      points(Expr or DistArray): the data points to be clustered.
      k(int): the number of clusters to generate.
      num_iter(int): the max number of iterations that kmeans runs.
      similarity_measurement(str): distance method used to measure similarity
        between two points.
    '''
    # Pairwise similarities between points form the adjacency matrix A.
    A = expr.shuffle(points,
                     _row_similarity_mapper,
                     kw={'similarity_measurement': similarity_measurement},
                     shape_hint=(points.shape[0], points.shape[0]))

    num_dims = A.shape[1]

    # Diagonal degree matrix D, kept as the vector of row sums.
    D = expr.sum(A, axis=1, tile_hint=(A.shape[0],))

    # Normalized Laplacian: L = D^(-0.5) A D^(-0.5).
    L = expr.shuffle(A, _laplacian_mapper, kw={'D': D}, shape_hint=A.shape)

    # Lanczos eigen-decomposition; ask for a few extra vectors, keep top k.
    overshoot = min(k * 2, num_dims)
    _eigvals, U = lanczos.solve(L, L, overshoot, True)
    U = U[:, :k]

    # Seed the centers with the rows that carry each column's max eigen value.
    init_clusters = U[np.argmax(U, axis=0)]

    # Final kmeans pass in the eigenvector space, seeded with init_clusters.
    model = KMeans(k, num_iter)
    U = expr.from_numpy(U)
    _centers, labels = model.fit(U, init_clusters)

    return labels
Example no. 7
0
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
  '''
  Cluster the given points via spectral clustering followed by kmeans.

  Args:
    points(Expr or DistArray): the data points to be clustered.
    k(int): the number of clusters we need to generate.
    num_iter(int): the max number of iterations that kmeans runs.
    similarity_measurement(str): distance method used to measure similarity
      between two points.
  '''
  # Adjacency matrix A: similarity for every pair of points.
  A = expr.shuffle(points, _row_similarity_mapper,
                   kw={'similarity_measurement': similarity_measurement})

  num_dims = A.shape[1]

  # D is the diagonal degree matrix, stored as a vector of row sums.
  D = expr.sum(A, axis=1, tile_hint=(A.shape[0],))

  # Normalized Laplacian of the form L = D^(-0.5) A D^(-0.5).
  L = expr.shuffle(A, _laplacian_mapper, kw={'D': D})

  # Eigen-decompose with the Lanczos solver, over-requesting vectors, then
  # keep only the leading k eigenvectors.
  overshoot = min(k * 2, num_dims)
  _eigvals, U = lanczos.solve(L, L, overshoot, True)
  U = U[:, :k]

  # Initial centers: the rows holding the max eigen value of each column.
  init_clusters = U[np.argmax(U, axis=0)]

  # Run kmeans in eigenvector space, seeded with init_clusters.
  model = KMeans(k, num_iter)
  U = expr.from_numpy(U)
  _centers, labels = model.fit(U, init_clusters)

  return labels
Example no. 8
0
 def test_kmeans_expr(self):
     """Fit KMeans on random points with parakeet codegen disabled."""
     # Turn off the parakeet backend so the plain expr path is exercised.
     FLAGS.opt_parakeet_gen = 0
     sample = expr.rand(N_PTS, N_DIM)
     KMeans(N_CENTERS, ITER).fit(sample)
Example no. 9
0
 def test_kmeans_expr(self):
   """Smoke test: KMeans.fit runs with parakeet code generation off."""
   FLAGS.opt_parakeet_gen = 0  # force the non-parakeet evaluation path
   data = expr.rand(N_PTS, N_DIM)
   model = KMeans(N_CENTERS, ITER)
   model.fit(data)