Example #1
def benchmark_linear_regression(ctx, timer):
    N_EXAMPLES = 10 * 1000 * 1000 * ctx.num_workers
    N_DIM = 10
    x = expr.rand(N_EXAMPLES,
                  N_DIM,
                  tile_hint=(N_EXAMPLES / ctx.num_workers,
                             N_DIM)).astype(np.float32)

    y = expr.rand(N_EXAMPLES, 1, tile_hint=(N_EXAMPLES / ctx.num_workers,
                                            1)).astype(np.float32)

    w = np.random.rand(N_DIM, 1).astype(np.float32)

    x = expr.eager(x)
    y = expr.eager(y)

    def _step():
        yp = expr.dot(x, w)
        Assert.all_eq(yp.shape, y.shape)

        diff = x * (yp - y)
        grad = expr.sum(diff, axis=0).glom().reshape((N_DIM, 1))
        wprime = w - grad * 1e-6
        wprime.evaluate()

    for i in range(25):
        timer.time_op('linear-regression', _step)
Example #2
def benchmark_linear_regression(ctx, timer):
  N_EXAMPLES = 10 * 1000 * 1000 * ctx.num_workers
  N_DIM = 10
  x = expr.rand(N_EXAMPLES, N_DIM, 
                tile_hint=(N_EXAMPLES / ctx.num_workers, N_DIM)).astype(np.float32)
  
  y = expr.rand(N_EXAMPLES, 1, 
                tile_hint=(N_EXAMPLES / ctx.num_workers, 1)).astype(np.float32)
  
  w = np.random.rand(N_DIM, 1).astype(np.float32)
  
  x = expr.eager(x)
  y = expr.eager(y)
 
  def _step():
    yp = expr.dot(x, w)
    Assert.all_eq(yp.shape, y.shape)
    
    diff = x * (yp - y)
    grad = expr.sum(diff, axis=0).glom().reshape((N_DIM, 1))
    wprime = w - grad * 1e-6
    expr.force(wprime)

  for i in range(25):
    timer.time_op('linear-regression', _step)
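For reference, the `_step` closure in the two examples above times one batch gradient-descent update for least-squares linear regression. A minimal plain-NumPy sketch of the same update (illustrative sizes, same 1e-6 step size, no Spartan involved):

import numpy as np

# Plain-NumPy sketch of the gradient step timed above (illustrative sizes).
N_EXAMPLES, N_DIM = 1000, 10
x = np.random.rand(N_EXAMPLES, N_DIM).astype(np.float32)
y = np.random.rand(N_EXAMPLES, 1).astype(np.float32)
w = np.random.rand(N_DIM, 1).astype(np.float32)

yp = x.dot(w)                                          # predictions, shape (N_EXAMPLES, 1)
grad = (x * (yp - y)).sum(axis=0).reshape((N_DIM, 1))  # equivalent to x.T.dot(yp - y)
w = w - grad * 1e-6                                    # one gradient-descent step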
Example #3
  def test_knn(self):
    ctx = spartan.blob_ctx.get()
    N_QUERY = ctx.num_workers * 2
    N_DIM = ctx.num_workers * 2
    X = expr.rand(N_SAMPLES, N_DIM)
    Y = expr.rand(N_QUERY, N_DIM)
    #dist, ind = SKNN().fit(X).kneighbors(Y)
    dist2, ind2 = NearestNeighbors().fit(X).kneighbors(Y)
Example #4
def benchmark_lreg(ctx, timer):
  print "#worker:", ctx.num_workers
  FLAGS.opt_parakeet_gen = 0
  N_EXAMPLES = 4000000 * ctx.num_workers
  #N_EXAMPLES = 5000000 * 64
  x = expr.rand(N_EXAMPLES, N_DIM)
  y = expr.rand(N_EXAMPLES, 1)
  start = time.time()
  linear_regression.linear_regression(x, y, ITERATION)
  total = time.time() - start
  util.log_warn("time cost : %s s" % (total*1.0/ITERATION,))
Example #5
def benchmark_ridgereg(ctx, timer):
    print "#worker:", ctx.num_workers
    #N_EXAMPLES = 100000000 * ctx.num_workers
    N_EXAMPLES = 90000000 * ctx.num_workers
    x = expr.rand(N_EXAMPLES, N_DIM)
    y = expr.rand(N_EXAMPLES, 1)
    start = time.time()
    ridge_regression.ridge_regression(x, y, 1, ITERATION)

    total = time.time() - start
    util.log_warn("time cost : %s s" % (total * 1.0 / ITERATION, ))
Example #6
def benchmark_logreg(ctx, timer):
  print "#worker:", ctx.num_workers
  #N_EXAMPLES = 40000000 * ctx.num_workers
  N_EXAMPLES = 5000000 * 64
  x = expr.eager(expr.rand(N_EXAMPLES, N_DIM, tile_hint=(N_EXAMPLES / ctx.num_workers, N_DIM)))
  y = expr.eager(expr.rand(N_EXAMPLES, 1, tile_hint=(N_EXAMPLES / ctx.num_workers, 1)))
  start = time.time()
  logistic_regression.logistic_regression(x, y, ITERATION)

  total = time.time() - start
  util.log_warn("time cost : %s s" % (total*1.0/ITERATION,))
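`logistic_regression` itself is not part of the snippet above. As a point of reference only, one batch update of a typical logistic-regression gradient descent in plain NumPy looks roughly like this (sizes and step size are illustrative assumptions, not the spartan code path):

import numpy as np

# Illustrative sketch of one batch logistic-regression update.
N, D = 1000, 10
x = np.random.rand(N, D)
y = (np.random.rand(N, 1) > 0.5).astype(np.float64)  # fake binary labels
w = np.zeros((D, 1))

p = 1.0 / (1.0 + np.exp(-x.dot(w)))  # sigmoid predictions
grad = x.T.dot(p - y) / N            # gradient of the average log-loss
w -= 0.1 * grad                      # one update step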
Example #7
def benchmark_ridgereg(ctx, timer):
  print "#worker:", ctx.num_workers
  #N_EXAMPLES = 100000000 * ctx.num_workers
  N_EXAMPLES = 90000000 * ctx.num_workers
  x = expr.rand(N_EXAMPLES, N_DIM)
  y = expr.rand(N_EXAMPLES, 1)
  start = time.time() 
  ridge_regression.ridge_regression(x, y, 1, ITERATION)
  
  total = time.time() - start
  util.log_warn("time cost : %s s" % (total*1.0/ITERATION,))
Example #8
def benchmark_knn(ctx, timer):
  print "#worker:", ctx.num_workers
  N_SAMPLES = ctx.num_workers * 300
  N_QUERY = ctx.num_workers * 2
  N_DIM = ctx.num_workers * 2
  X = expr.rand(N_SAMPLES, N_DIM)
  Y = expr.rand(N_QUERY, N_DIM)
  
  t1 = datetime.now()
  dist2, ind2 = NearestNeighbors().fit(X).kneighbors(Y)
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print "total cost time:%s ms" % (cost_time)
Example #9
def random_galaxy(n):
    '''Generate a galaxy of random bodies.'''
    dtype = np.float  # consistent with sp.rand, same as np.float64

    galaxy = {  # All bodies stand still initially.
        'm': (rand(n) + dtype(10)) * dtype(m_sol / 10),
        'x': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
        'y': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
        'z': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
        'vx': zeros((n, )),
        'vy': zeros((n, )),
        'vz': zeros((n, ))
    }
    return galaxy
Example #10
def random_galaxy(n):
  '''Generate a galaxy of random bodies.'''
  dtype = np.float  # consistent with sp.rand, same as np.float64

  galaxy = {  # All bodies stand still initially.
      'm': (rand(n) + dtype(10)) * dtype(m_sol/10),
      'x': (rand(n) - dtype(0.5)) * dtype(r_ly/100),
      'y': (rand(n) - dtype(0.5)) * dtype(r_ly/100),
      'z': (rand(n) - dtype(0.5)) * dtype(r_ly/100),
      'vx': zeros((n, )),
      'vy': zeros((n, )),
      'vz': zeros((n, ))
      }
  return galaxy
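`random_galaxy` relies on several module-level names that the snippets do not show: `rand`, `zeros`, `m_sol` and `r_ly`. A self-contained sketch using NumPy stand-ins, with the physical constants as labelled assumptions:

import numpy as np
from numpy.random import rand  # stand-in for the rand used above
from numpy import zeros

m_sol = 1.989e30  # assumed: mass of the sun, kg
r_ly = 9.461e15   # assumed: one light-year, m

galaxy = random_galaxy(100)    # 100 random bodies, all initially at rest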
Example #11
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = points.evaluate()
  num_dim = points.shape[1]
  if centers is None:
      centers = expr.rand(k, num_dim)

  #labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = centers.glom()
    fuzzy = expr.map2(points, 0, fn=kmeans_map2_dist_mapper,
                      fn_kw={"centers": centers, "m": m},
                      shape=(points.shape[0], centers.shape[0]))
    labels = expr.argmax(fuzzy, axis=1)
    new_centers = expr.map2((points, fuzzy), (0, 0), fn=kmeans_map2_center_mapper,
                            fn_kw={"centers": centers, "m": m},
                            shape=(centers.shape[0], centers.shape[1]), reducer=np.add)
    new_centers /= expr.sum(fuzzy ** m, axis=0)[:, expr.newaxis]
    centers = new_centers
  return labels
Example #12
def benchmark_cg(ctx, timer):
  print "#worker:", ctx.num_workers
  l = int(math.sqrt(ctx.num_workers))
  n = 2000 * 16
  #n = 4000 * l
  la = 20
  niter = 5
  tile_hint = (n, n/ctx.num_workers)
  
  #nonzer = 7
  #nz = n * (nonzer + 1) * (nonzer + 1) + n * (nonzer + 2)
  #density = 0.5 * nz/(n*n)
  A = expr.rand(n, n, tile_hint=tile_hint)
  A = (A + expr.transpose(A))*0.5
  
  I = expr.sparse_diagonal((n,n), tile_hint=tile_hint) * la
  I.force()
  A = expr.eager(A - I)

  #x1 = numpy_cg(A.glom(), niter)
  util.log_warn('begin cg!')
  t1 = datetime.now()
  x2 = conj_gradient(A, niter).force()
  t2 = datetime.now()
  cost_time = millis(t1,t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/niter)
Example #13
  def test_kmeans_expr(self):
    ctx = spartan.blob_ctx.get()
    pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(divup(N_PTS, ctx.num_workers), N_DIM)).force()

    k = KMeans(N_CENTERS, ITER)
    k.fit(pts)
Example #14
def benchmark_cg(ctx, timer):
    print "#worker:", ctx.num_workers
    l = int(math.sqrt(ctx.num_workers))
    #n = 2000 * 16
    n = 500 * ctx.num_workers
    la = 20
    niter = 5

    #nonzer = 7
    #nz = n * (nonzer + 1) * (nonzer + 1) + n * (nonzer + 2)
    #density = 0.5 * nz/(n*n)
    A = expr.rand(n, n)
    A = (A + expr.transpose(A)) * 0.5

    I = expr.sparse_diagonal((n, n)) * la
    A = A - I

    #x1 = numpy_cg(A.glom(), niter)
    util.log_warn('begin cg!')
    t1 = datetime.now()
    x2 = conj_gradient(A, niter).force()
    t2 = datetime.now()
    cost_time = millis(t1, t2)
    print "total cost time:%s ms, per iter cost time:%s ms" % (
        cost_time, cost_time / niter)
Example #15
  def test_matrix_mult(self):
    _skip_if_travis()
    N_POINTS = 2000
    x = expr.rand(N_POINTS, N_POINTS, tile_hint=(N_POINTS, N_POINTS/ self.ctx.num_workers)).astype(np.float32)
    y = expr.rand(N_POINTS, N_POINTS, tile_hint=(N_POINTS / self.ctx.num_workers, N_POINTS)).astype(np.float32)
    
    x = expr.eager(x)
    y = expr.eager(y)

    start = time.time()

    for i in range(5):
      res = expr.dot(x, y, tile_hint=(N_POINTS, N_POINTS/ self.ctx.num_workers))
      res.force()

    cost = time.time() - start
    self._verify_cost("matrix_mult", cost)
Example #16
  def test_matrix_mult(self):
    _skip_if_travis()
    N_POINTS = 2000
    x = expr.rand(N_POINTS, N_POINTS, tile_hint=(N_POINTS, N_POINTS / self.ctx.num_workers)).astype(np.float32)
    y = expr.rand(N_POINTS, N_POINTS, tile_hint=(N_POINTS / self.ctx.num_workers, N_POINTS)).astype(np.float32)

    x = expr.eager(x)
    y = expr.eager(y)

    start = time.time()

    for i in range(5):
      res = expr.dot(x, y, tile_hint=(N_POINTS, N_POINTS / self.ctx.num_workers))
      res.evaluate()

    cost = time.time() - start
    self._verify_cost("matrix_mult", cost)
Example #17
def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1, max_iter=10, max_iter_per_doc=1):
    """
  Use the Collapsed Variational Bayes method (Mahout implementation) to train an LDA topic model.

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics: the number of topics we need to find.
    alpha(float): parameter of LDA model.
    eta(float): parameter of LDA model.
    max_iter(int): the max iterations to train the LDA topic model.
    max_iter_per_doc: the max iterations to train each document.
  """
    num_terms = terms_docs_matrix.shape[0]
    num_docs = terms_docs_matrix.shape[1]

    topic_term_counts = expr.rand(k_topics, num_terms)
    for i in range(max_iter):
        # topic_term_counts = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
        # _lda_mapper,
        # target=expr.ndarray((k_topics, num_terms), dtype=np.float64, reduce_fn=np.add),
        # kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta, 'max_iter_per_doc': max_iter_per_doc,
        #'topic_term_counts': topic_term_counts}).optimized()
        topic_term_counts = expr.outer(
            (terms_docs_matrix, topic_term_counts),
            (1, None),
            fn=_lda_mapper,
            fn_kw={"k_topics": k_topics, "alpha": alpha, "eta": eta, "max_iter_per_doc": max_iter_per_doc},
            shape=(k_topics, num_terms),
            dtype=np.float64,
            reducer=np.add,
        )
    # calculate the doc-topic inference
    # doc_topics = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
    # _lda_doc_topic_mapper,
    # kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta, 'max_iter_per_doc': max_iter_per_doc,
    #'topic_term_counts': topic_term_counts},
    # shape_hint=(num_docs, k_topics)).optimized()
    doc_topics = expr.outer(
        (terms_docs_matrix, topic_term_counts),
        (1, None),
        fn=_lda_doc_topic_mapper,
        fn_kw={"k_topics": k_topics, "alpha": alpha, "eta": eta, "max_iter_per_doc": max_iter_per_doc},
        shape=(num_docs, k_topics),
        dtype=np.float64,
    )

    # normalize the topic-term distribution
    norm_val = expr.reduce(
        topic_term_counts,
        axis=1,
        dtype_fn=lambda input: input.dtype,
        local_reduce_fn=lambda ex, data, axis: np.abs(data).sum(axis),
        accumulate_fn=np.add,
    )
    topic_term_counts = topic_term_counts / norm_val.reshape((k_topics, 1))
    topic_term_counts = topic_term_counts.optimized()
    return doc_topics, topic_term_counts
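The `expr.reduce` at the end of `learn_topics` computes a per-row L1 norm (sum of absolute values along axis 1), so the final normalization is equivalent to the following plain-NumPy operation (illustrative shape):

import numpy as np

topic_term_counts = np.random.rand(5, 200)        # illustrative (k_topics, num_terms)
norm_val = np.abs(topic_term_counts).sum(axis=1)  # per-row L1 norm, shape (k_topics,)
topic_term_counts = topic_term_counts / norm_val.reshape((5, 1))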
Example #18
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
      centers = expr.rand(k, num_dim)

  labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = expr.as_array(centers)
    points_broadcast = expr.reshape(points, (points.shape[0], 1, points.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(points_broadcast - centers_broadcast), axis=2)
    # Add a tiny epsilon to avoid division by zero.
    distances = distances + 0.00000000001
    util.log_info('distances shape %s' % str(distances.shape))
    distances_broadcast = expr.reshape(distances, (distances.shape[0], 1,
                                                   distances.shape[1]))
    distances_broadcast2 = expr.reshape(distances, (distances.shape[0],
                                                    distances.shape[1], 1))
    prob = 1.0 / expr.sum(expr.power(distances_broadcast / distances_broadcast2,
                                     2.0 / (m - 1)), axis=2)
    prob.force()
    counts = expr.sum(prob, axis=0)
    counts = expr.reshape(counts, (counts.shape[0], 1))
    labels = expr.argmax(prob, axis=1)
    centers = expr.sum(expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
                       expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
                       axis=0)

    # We assume that the centers matrix is small enough to be handled on the master.
    counts = counts.glom()
    centers = centers.glom()
    # If any centroids don't have any points assigned to them.
    zcount_indices = (counts == 0).reshape(k)

    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which results in their
      # position being the zero-vector.  We reseed these centroids with new random values
      # and set their counts to 1 in order to get rid of dividing by zero.
      counts[zcount_indices, :] = 1
      centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                  num_dim)

    centers = centers / counts
  return labels
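The loop above spells out the fuzzy membership and center updates with broadcasting. The same computation in plain NumPy, with small illustrative shapes and the same tiny epsilon added to the distances:

import numpy as np

m = 2.0
points = np.random.rand(100, 2)   # (n_points, n_dim)
centers = np.random.rand(10, 2)   # (k, n_dim)

# Squared distance from every point to every center, shape (n_points, k).
distances = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2) + 1e-11
# Membership weights via the same (n, 1, k) / (n, k, 1) broadcast as above.
prob = 1.0 / ((distances[:, None, :] / distances[:, :, None]) ** (2.0 / (m - 1))).sum(axis=2)
counts = prob.sum(axis=0).reshape((-1, 1))   # (k, 1)
labels = prob.argmax(axis=1)
centers = (points[:, None, :] * prob[:, :, None]).sum(axis=0) / counts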
Example #19
def benchmark_canopy_clustering(ctx, timer):
    # N_PTS = 60000 * ctx.num_workers
    N_PTS = 30000 * 64
    N_DIM = 2

    pts = expr.rand(N_PTS, N_DIM, tile_hint=(N_PTS / ctx.num_workers, N_DIM)).evaluate()

    t1 = datetime.now()
    cluster_result = canopy_cluster(pts).evaluate()
    t2 = datetime.now()
    print "canopy_cluster time:%s ms" % millis(t1, t2)
Example #20
def benchmark_linear_regression(ctx, timer):
    N_EXAMPLES = 65536
    N_DIM = 1
    x = expr.rand(N_EXAMPLES, N_DIM, tile_hint=(N_EXAMPLES / N_TILES, N_DIM)).astype(np.float32)

    x = expr.eager(x)

    def _step():
        y = expr.force(x * x)

    for i in range(25):
        _step()
Example #21
def benchmark_canopy_clustering(ctx, timer):
  #N_PTS = 60000 * ctx.num_workers
  N_PTS = 30000 * 64
  N_DIM = 2

  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).force()

  t1 = datetime.now()
  cluster_result = canopy_cluster(pts).force()
  t2 = datetime.now()
  print 'canopy_cluster time:%s ms' % millis(t1, t2)
Example #22
def benchmark_linear_regression(ctx, timer):
    N_EXAMPLES = 65536
    N_DIM = 1
    x = expr.rand(N_EXAMPLES, N_DIM,
                  tile_hint=(N_EXAMPLES / N_TILES, N_DIM)).astype(np.float32)

    x = expr.eager(x)

    def _step():
        y = expr.force(x * x)

    for i in range(25):
        _step()
Example #23
def benchmark_kmeans(ctx, timer):
  print "#worker:", ctx.num_workers
  N_PTS = 1000 * 256
  N_CENTERS = 10
  N_DIM = 512
  ITER = 1
  pts = expr.rand(N_PTS, N_DIM)
  k = KMeans(N_CENTERS, ITER)
  t1 = datetime.now()
  k.fit(pts)
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/ITER)
Example #24
def benchmark_fuzzy_kmeans(ctx, timer):
  #N_PTS = 40000 * ctx.num_workers
  N_PTS = 1000 * 256
  N_DIM = 512
  ITER = 5
  N_CENTERS = 10

  pts = expr.rand(N_PTS, N_DIM)

  t1 = datetime.now()
  cluster_result = fuzzy_kmeans(pts, k=N_CENTERS, num_iter=ITER).evaluate()
  t2 = datetime.now()
  time_cost = millis(t1, t2)
  print 'fuzzy_cluster time:%s ms, per_iter:%s ms' % (time_cost, time_cost/ITER)
Example #25
def benchmark_spectral_clustering(ctx, timer):
    #N_PTS = 500 * ctx.num_workers
    N_PTS = 50 * 64
    N_DIM = 2
    ITER = 5
    N_CENTERS = 5

    pts = expr.rand(N_PTS, N_DIM,
                    tile_hint=(N_PTS / ctx.num_workers, N_DIM)).force()

    t1 = datetime.now()
    cluster_result = spectral_cluster(pts, N_CENTERS, ITER).glom()
    t2 = datetime.now()
    print 'spectral_cluster time:%s ms' % millis(t1, t2)
Example #26
def benchmark_spectral_clustering(ctx, timer):
  #N_PTS = 500 * ctx.num_workers
  N_PTS = 50 * 64
  N_DIM = 2
  ITER = 5
  N_CENTERS = 5

  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).evaluate()

  t1 = datetime.now()
  cluster_result = spectral_cluster(pts, N_CENTERS, ITER).glom()
  t2 = datetime.now()
  print 'spectral_cluster time:%s ms' % millis(t1, t2)
Example #27
def benchmark_kmeans(ctx, timer):
    print "#worker:", ctx.num_workers
    N_PTS = 1000 * 256
    N_CENTERS = 10
    N_DIM = 512
    ITER = 1
    pts = expr.rand(N_PTS, N_DIM)
    k = KMeans(N_CENTERS, ITER)
    t1 = datetime.now()
    k.fit(pts)
    t2 = datetime.now()
    cost_time = millis(t1, t2)
    print "total cost time:%s ms, per iter cost time:%s ms" % (
        cost_time, cost_time / ITER)
Example #28
def benchmark_fuzzy_kmeans(ctx, timer):
    # N_PTS = 40000 * ctx.num_workers
    N_PTS = 1000 * 256
    N_DIM = 512
    ITER = 5
    N_CENTERS = 10

    pts = expr.rand(N_PTS, N_DIM)

    t1 = datetime.now()
    cluster_result = fuzzy_kmeans(pts, k=N_CENTERS, num_iter=ITER).evaluate()
    t2 = datetime.now()
    time_cost = millis(t1, t2)
    print "fuzzy_cluster time:%s ms, per_iter:%s ms" % (time_cost, time_cost / ITER)
Example #29
  def test_linear_reg(self):
    _skip_if_travis()
    N_EXAMPLES = 10 * 1000 * 1000 * self.ctx.num_workers
    N_DIM = 10
    x = expr.rand(N_EXAMPLES, N_DIM,
                  tile_hint=(N_EXAMPLES / self.ctx.num_workers, N_DIM)).astype(np.float32)

    y = expr.rand(N_EXAMPLES, 1,
                  tile_hint=(N_EXAMPLES / self.ctx.num_workers, 1)).astype(np.float32)

    w = np.random.rand(N_DIM, 1).astype(np.float32)
    x = expr.eager(x)
    y = expr.eager(y)

    start = time.time()

    for i in range(5):
      yp = expr.dot(x, w)
      diff = x * (yp - y)
      grad = expr.sum(diff, axis=0, tile_hint=[N_DIM]).glom().reshape((N_DIM, 1))
      w = w - grad * 1e-6

    cost = time.time() - start
    self._verify_cost("linear_reg", cost)
Example #30
  def test_linear_reg(self):
    _skip_if_travis()
    N_EXAMPLES =  10 * 1000 * 1000 * self.ctx.num_workers
    N_DIM = 10
    x = expr.rand(N_EXAMPLES, N_DIM, 
                  tile_hint=(N_EXAMPLES / self.ctx.num_workers, N_DIM)).astype(np.float32)
    
    y = expr.rand(N_EXAMPLES, 1, 
                  tile_hint=(N_EXAMPLES / self.ctx.num_workers, 1)).astype(np.float32)
    
    w = np.random.rand(N_DIM, 1).astype(np.float32)
    x = expr.eager(x)
    y = expr.eager(y)
    
    start = time.time()

    for i in range(5):
      yp = expr.dot(x, w)
      diff = x * (yp - y)
      grad = expr.sum(diff, axis=0, tile_hint=[N_DIM]).glom().reshape((N_DIM, 1))
      w = w - grad * 1e-6

    cost = time.time() - start
    self._verify_cost("linear_reg", cost)
Example #31
  def test_kmeans(self):
    _skip_if_travis()
    N_PTS = 1000 * 1000 * self.ctx.num_workers
    ITER = 5
    N_DIM = 10
    N_CENTERS = 10

    start = time.time()

    pts = expr.rand(N_PTS, N_DIM).evaluate()
    k = KMeans(N_CENTERS, ITER)
    k.fit(pts)

    cost = time.time() - start
    self._verify_cost("kmeans", cost)
Example #32
def benchmark_fuzzy_kmeans(ctx, timer):
  #N_PTS = 40000 * ctx.num_workers
  N_PTS = 20000 * 64
  N_DIM = 2
  ITER = 5
  N_CENTERS = 10
  
  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).force()

  t1 = datetime.now()
  cluster_result = fuzzy_kmeans(pts, k=N_CENTERS, num_iter=ITER).force()
  t2 = datetime.now()
  time_cost = millis(t1, t2)
  print 'fuzzy_cluster time:%s ms, per_iter:%s ms' % (time_cost, time_cost/ITER)
Example #33
  def test_kmeans(self):
    _skip_if_travis()
    N_PTS = 1000 * 1000 * self.ctx.num_workers
    ITER = 5
    N_DIM = 10
    N_CENTERS = 10
    
    start = time.time()

    pts = expr.rand(N_PTS, N_DIM).force()
    k = KMeans(N_CENTERS, ITER)
    k.fit(pts)
  
    cost = time.time() - start
    self._verify_cost("kmeans", cost)
Example #34
def benchmark_streaming_kmeans(ctx, timer):
  #N_PTS = 100 * ctx.num_workers
  N_PTS = 100 * 64
  N_DIM = 2
  N_CENTERS = 5
  
  pts = expr.rand(N_PTS, N_DIM,
                  tile_hint=(N_PTS / ctx.num_workers, N_DIM)).force()

  print pts.glom()
  t1 = datetime.now()
  cluster_result = streaming_kmeans(pts, k=N_CENTERS).glom()
  t2 = datetime.now()
  #print cluster_result.glom()
  time_cost = millis(t1, t2)
  print 'streaming_kmeans_cluster time:%s ms' % time_cost
Example #35
def benchmark_streaming_kmeans(ctx, timer):
    #N_PTS = 100 * ctx.num_workers
    N_PTS = 100 * 64
    N_DIM = 2
    N_CENTERS = 5

    pts = expr.rand(N_PTS, N_DIM,
                    tile_hint=(N_PTS / ctx.num_workers, N_DIM)).evaluate()

    print pts.glom()
    t1 = datetime.now()
    cluster_result = streaming_kmeans(pts, k=N_CENTERS).glom()
    t2 = datetime.now()
    #print cluster_result.glom()
    time_cost = millis(t1, t2)
    print 'streaming_kmeans_cluster time:%s ms' % time_cost
Example #36
def benchmark_pr(ctx, timer):
    num_pages = 300 * 1000 * 3 * ctx.num_workers
    num_outlinks = 10
    density = num_outlinks * 1.0 / num_pages
    same_site_prob = 0.9
    print "#worker:", ctx.num_workers
    col_step = util.divup(num_pages, ctx.num_workers)

    wts_tile_hint = [num_pages, col_step]
    p_tile_hint = [col_step, 1]
    #wts = expr.sparse_diagonal((num_pages, num_pages), dtype=np.float32, tile_hint=wts_tile_hint)
    #wts = expr.eager(
    #         expr.sparse_rand((num_pages, num_pages),
    #                          density=density,
    #                          format='csr',
    #                          dtype=np.float32,
    #                          tile_hint=wts_tile_hint))

    wts = pagerank_sparse(num_pages, num_outlinks, same_site_prob)
    #res = wts.glom().todense()
    #for i in range(res.shape[0]):
    #  l = []
    #  for j in range(res.shape[1]):
    #    l.append(round(res[i,j],1))
    #  print l
    #p = expr.sparse_empty((num_pages,1), dtype=np.float32, tile_hint=p_tile_hint).evaluate()
    #for i in range(num_pages):
    #  p[i,0] = 1
    #p = expr.sparse_rand((num_pages, 1), density=1.0, format='csc', dtype=np.float32, tile_hint=p_tile_hint)
    p = expr.rand(num_pages, 1).astype(np.float32)
    #q = expr.zeros((num_pages, 1), dtype=np.float32, tile_hint=p_tile_hint).evaluate()
    #q[:] = p.glom().todense()
    #q = expr.lazify(q)

    #r = expr.dot(wts, p)
    #print r.glom()
    t1 = datetime.now()
    sparse_multiply(wts, p, p_tile_hint)
    t2 = datetime.now()
    cost_time = millis(t1, t2)
    print 'current benchmark:', cost_time / num_iter / 1000
Example #37
def benchmark_pr(ctx, timer):
  num_pages = 300 * 1000 * 3 * ctx.num_workers
  num_outlinks = 10
  density = num_outlinks * 1.0 / num_pages
  same_site_prob = 0.9
  print "#worker:", ctx.num_workers
  col_step = util.divup(num_pages, ctx.num_workers)

  wts_tile_hint = [num_pages, col_step]
  p_tile_hint = [col_step, 1]
  #wts = expr.sparse_diagonal((num_pages, num_pages), dtype=np.float32, tile_hint=wts_tile_hint)
  #wts = expr.eager(
  #         expr.sparse_rand((num_pages, num_pages),
  #                          density=density,
  #                          format='csr',
  #                          dtype=np.float32,
  #                          tile_hint=wts_tile_hint))

  wts = pagerank_sparse(num_pages, num_outlinks, same_site_prob)
  #res = wts.glom().todense()
  #for i in range(res.shape[0]):
  #  l = []
  #  for j in range(res.shape[1]):
  #    l.append(round(res[i,j],1))
  #  print l
  #p = expr.sparse_empty((num_pages,1), dtype=np.float32, tile_hint=p_tile_hint).evaluate()
  #for i in range(num_pages):
  #  p[i,0] = 1
  #p = expr.sparse_rand((num_pages, 1), density=1.0, format='csc', dtype=np.float32, tile_hint=p_tile_hint)
  p = expr.rand(num_pages, 1).astype(np.float32)
  #q = expr.zeros((num_pages, 1), dtype=np.float32, tile_hint=p_tile_hint).evaluate()
  #q[:] = p.glom().todense()
  #q = expr.lazify(q)

  #r = expr.dot(wts, p)
  #print r.glom()
  t1 = datetime.now()
  sparse_multiply(wts, p, p_tile_hint)
  t2 = datetime.now()
  cost_time = millis(t1, t2)
  print 'current benchmark:', cost_time / num_iter / 1000
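`pagerank_sparse` and `sparse_multiply` are not shown in these snippets. As a rough single-machine reference for the kind of operation being timed here, a sparse matrix times dense vector product with SciPy (illustrative sizes, not the spartan code path):

import numpy as np
import scipy.sparse as sp

num_pages, num_outlinks = 100000, 10
density = num_outlinks * 1.0 / num_pages
wts = sp.rand(num_pages, num_pages, density=density, format='csr', dtype=np.float32)
p = np.random.rand(num_pages, 1).astype(np.float32)
r = wts.dot(p)  # the sparse mat-vec product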
Example #38
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10, M=None):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS) method,
  where `A` is the "ratings" matrix which maps a user and an item to a rating score, and
  `U` and `M` are the factor matrices, which represent user and item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the regularization parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit-feedback method for ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): the max number of iterations to run.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]

  AT = expr.transpose(A)

  avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

  M = expr.rand(num_items, num_features)
  M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))

  #A = expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0))
  #AT = expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0))
  for i in range(num_iter):
    # Recomputing U
    shape = (num_users, num_features)
    U = expr.outer((A, M), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float)
    # Recomputing M
    shape = (num_items, num_features)
    M = expr.outer((AT, U), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float)
  return U, M
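`_solve_U_or_M_mapper` is not included in the snippet. In the classic explicit-feedback ALS formulation, each row of `U` (or `M`) is obtained by solving a small regularized least-squares system against the other factor matrix; the sketch below is an assumption about what the mapper computes, not a transcription of it:

import numpy as np

def solve_factor_row(a_row, M, la):
  # Classic explicit-feedback ALS update for one user (or item) row:
  # solve (M_r^T M_r + la * n_i * I) u = M_r^T a_row over the rated entries only.
  rated = a_row != 0
  M_r = M[rated]
  n_i = max(int(rated.sum()), 1)
  lhs = M_r.T.dot(M_r) + la * n_i * np.eye(M.shape[1])
  rhs = M_r.T.dot(a_row[rated])
  return np.linalg.solve(lhs, rhs)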
Example #39
def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1, max_iter=10, max_iter_per_doc=1):
  '''
  Use the Collapsed Variational Bayes method (Mahout implementation) to train an LDA topic model.

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics: the number of topics we need to find.
    alpha(float): parameter of LDA model.
    eta(float): parameter of LDA model.
    max_iter(int): the max iterations to train the LDA topic model.
    max_iter_per_doc: the max iterations to train each document.
  '''
  topic_term_counts = expr.rand(k_topics, terms_docs_matrix.shape[0], 
                                tile_hint=(k_topics, terms_docs_matrix.shape[0]))

  for i in range(max_iter):
    new_topic_term_counts = expr.ndarray((k_topics, terms_docs_matrix.shape[0]), 
                                         dtype=np.float64, 
                                         reduce_fn=np.add, 
                                         tile_hint=(k_topics, terms_docs_matrix.shape[0]))
    topic_term_counts = expr.shuffle(terms_docs_matrix, _lda_mapper, target=new_topic_term_counts, 
                                     kw={'k_topics': k_topics, 'alpha': alpha, 'eta':eta, 
                                         'max_iter_per_doc': max_iter_per_doc, 
                                         'topic_term_counts': topic_term_counts})
    
  # calculate the doc-topic inference
  doc_topics = expr.shuffle(terms_docs_matrix, _lda_doc_topic_mapper, 
                            kw={'k_topics': k_topics, 'alpha': alpha, 'eta':eta, 
                                'max_iter_per_doc': max_iter_per_doc, 
                                'topic_term_counts': topic_term_counts})
  
  # normalize the topic-term distribution  
  norm_val = expr.reduce(topic_term_counts, axis=1, 
                         dtype_fn=lambda input: input.dtype, 
                         local_reduce_fn=lambda ex, data, axis:np.abs(data).sum(axis), 
                         accumulate_fn=np.add)
  topic_term_counts = topic_term_counts / norm_val.reshape((topic_term_counts.shape[0], 1))

  return doc_topics, topic_term_counts
Example #40
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means clustering method.
  
  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans. 
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
      centers = expr.rand(k, num_dim, tile_hint=(k, num_dim))
  
  labels = expr.zeros((points.shape[0],), dtype=np.int, tile_hint=(points.shape[0]/len(points.tiles),))
  for iter in range(num_iter):
    new_centers = expr.ndarray((k, num_dim), reduce_fn=lambda a, b: a + b, tile_hint=(k, num_dim))
    new_counts = expr.ndarray((k, 1), dtype=np.float, reduce_fn=lambda a, b: a + b, tile_hint=(k, 1))
    expr.shuffle(points, _fuzzy_kmeans_mapper, kw={'old_centers': centers, 
                                                   'centers': new_centers, 
                                                   'counts': new_counts, 
                                                   'labels': labels, 
                                                   'm': m}).force()
    
    # If any centroids don't have any points assigned to them.
    zcount_indices = (new_counts.glom() == 0).reshape(k)
      
    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which results in their
      # position being the zero-vector.  We reseed these centroids with new random values
      # and set their counts to 1 in order to get rid of dividing by zero.
      new_counts[zcount_indices, :] = 1
      new_centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices), num_dim)
        
    centers = new_centers / new_counts
    
  return labels
Example #41
def als(A,
        la=0.065,
        alpha=40,
        implicit_feedback=False,
        num_features=20,
        num_iter=10,
        M=None):
    '''
  Compute the factorization A = U M' using the alternating least-squares (ALS) method,
  where `A` is the "ratings" matrix which maps a user and an item to a rating score, and
  `U` and `M` are the factor matrices, which represent user and item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the regularization parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit-feedback method for ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): the max number of iterations to run.
  '''
    num_users = A.shape[0]
    num_items = A.shape[1]

    AT = expr.transpose(A)

    avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

    M = expr.rand(num_items, num_features)
    M = expr.assign(M, np.s_[:, 0], avg_rating.reshape(
        (avg_rating.shape[0], 1)))

    #A = expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0))
    #AT = expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0))
    for i in range(num_iter):
        # Recomputing U
        shape = (num_users, num_features)
        U = expr.outer(
            (A, M), (0, None),
            fn=_solve_U_or_M_mapper,
            fn_kw={
                'la': la,
                'alpha': alpha,
                'implicit_feedback': implicit_feedback,
                'shape': shape
            },
            shape=shape,
            dtype=np.float)
        # Recomputing M
        shape = (num_items, num_features)
        M = expr.outer(
            (AT, U), (0, None),
            fn=_solve_U_or_M_mapper,
            fn_kw={
                'la': la,
                'alpha': alpha,
                'implicit_feedback': implicit_feedback,
                'shape': shape
            },
            shape=shape,
            dtype=np.float)
    return U, M
Example #42
def run(N_EXAMPLES, N_DIM, iterations):
  x = expr.rand(N_EXAMPLES, N_DIM)
  y = expr.rand(N_EXAMPLES, 1)
  logistic_regression(x, y, iterations)
Example #43
def learn_topics(terms_docs_matrix,
                 k_topics,
                 alpha=0.1,
                 eta=0.1,
                 max_iter=10,
                 max_iter_per_doc=1):
    '''
  Use the Collapsed Variational Bayes method (Mahout implementation) to train an LDA topic model.

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics: the number of topics we need to find.
    alpha(float): parameter of LDA model.
    eta(float): parameter of LDA model.
    max_iter(int): the max iterations to train the LDA topic model.
    max_iter_per_doc: the max iterations to train each document.
  '''
    num_terms = terms_docs_matrix.shape[0]
    num_docs = terms_docs_matrix.shape[1]

    topic_term_counts = expr.rand(k_topics, num_terms)
    for i in range(max_iter):
        #topic_term_counts = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
        #_lda_mapper,
        #target=expr.ndarray((k_topics, num_terms), dtype=np.float64, reduce_fn=np.add),
        #kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta, 'max_iter_per_doc': max_iter_per_doc,
        #'topic_term_counts': topic_term_counts}).optimized()
        topic_term_counts = expr.outer(
            (terms_docs_matrix, topic_term_counts), (1, None),
            fn=_lda_mapper,
            fn_kw={
                'k_topics': k_topics,
                'alpha': alpha,
                'eta': eta,
                'max_iter_per_doc': max_iter_per_doc
            },
            shape=(k_topics, num_terms),
            dtype=np.float64,
            reducer=np.add)
    # calculate the doc-topic inference
    #doc_topics = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
    #_lda_doc_topic_mapper,
    #kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta, 'max_iter_per_doc': max_iter_per_doc,
    #'topic_term_counts': topic_term_counts},
    #shape_hint=(num_docs, k_topics)).optimized()
    doc_topics = expr.outer(
        (terms_docs_matrix, topic_term_counts), (1, None),
        fn=_lda_doc_topic_mapper,
        fn_kw={
            'k_topics': k_topics,
            'alpha': alpha,
            'eta': eta,
            'max_iter_per_doc': max_iter_per_doc
        },
        shape=(num_docs, k_topics),
        dtype=np.float64)

    # normalize the topic-term distribution
    norm_val = expr.reduce(
        topic_term_counts,
        axis=1,
        dtype_fn=lambda input: input.dtype,
        local_reduce_fn=lambda ex, data, axis: np.abs(data).sum(axis),
        accumulate_fn=np.add)
    topic_term_counts = topic_term_counts / norm_val.reshape((k_topics, 1))
    return doc_topics, topic_term_counts
Example #44
def run(N_EXAMPLES, N_DIM, iterations):
  x = expr.eager(expr.rand(N_EXAMPLES, N_DIM, tile_hint=(N_EXAMPLES / 10, 10)))
  y = expr.eager(expr.rand(N_EXAMPLES, 1, tile_hint=(N_EXAMPLES / 10, 1)))
  linear_regression(x, y, iterations)
Example #45
def benchmark_sort(ctx, timer):
  A = expr.rand(10, 10, 10).force()
  T = expr.sort(A)
  print np.all(np.equal(T.glom(), np.sort(A.glom(), axis=None)))
Example #46
  def test_kmeans_expr(self):
    FLAGS.opt_parakeet_gen = 0
    pts = expr.rand(N_PTS, N_DIM)
    k = KMeans(N_CENTERS, ITER)
    k.fit(pts)
Example #47
    def fit(self, X, centers=None, implementation='map2'):
        """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
        num_dim = X.shape[1]
        num_points = X.shape[0]

        labels = expr.zeros((num_points, 1), dtype=np.int)

        if implementation == 'map2':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.map2(X,
                                   0,
                                   fn=kmeans_map2_dist_mapper,
                                   fn_kw={"centers": centers},
                                   shape=(X.shape[0], ))

                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
            return centers, labels

        elif implementation == 'outer':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.outer((X, centers), (0, None),
                                    fn=kmeans_outer_dist_mapper,
                                    shape=(X.shape[0], ))
                #labels = expr.argmin(distances, axis=1)
                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'broadcast':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                util.log_warn("k_means_ %d %d", i, time.time())
                X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
                centers_broadcast = expr.reshape(
                    centers, (1, centers.shape[0], centers.shape[1]))
                distances = expr.sum(expr.square(X_broadcast -
                                                 centers_broadcast),
                                     axis=2)
                labels = expr.argmin(distances, axis=1)
                center_idx = expr.arange((1, centers.shape[0]))
                matches = expr.reshape(labels,
                                       (labels.shape[0], 1)) == center_idx
                matches = matches.astype(np.int64)
                counts = expr.sum(matches, axis=0)
                centers = expr.sum(
                    X_broadcast *
                    expr.reshape(matches,
                                 (matches.shape[0], matches.shape[1], 1)),
                    axis=0)

                counts = counts.optimized().glom()
                centers = centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'shuffle':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                # Reset them to zero.
                new_centers = expr.ndarray((self.n_clusters, num_dim),
                                           reduce_fn=lambda a, b: a + b)
                new_counts = expr.ndarray((self.n_clusters, 1),
                                          dtype=np.int,
                                          reduce_fn=lambda a, b: a + b)

                _ = expr.shuffle(X,
                                 _find_cluster_mapper,
                                 kw={
                                     'd_pts': X,
                                     'old_centers': centers,
                                     'new_centers': new_centers,
                                     'new_counts': new_counts,
                                     'labels': labels
                                 },
                                 shape_hint=(1, ),
                                 cost_hint={
                                     hash(labels): {
                                         '00': 0,
                                         '01': np.prod(labels.shape)
                                     }
                                 })
                _.force()

                new_counts = new_counts.glom()
                new_centers = new_centers.glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (new_counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    new_counts[zcount_indices] = 1
                    new_centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                new_centers = new_centers / new_counts
                centers = new_centers

            return centers, labels
Example #48
  def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int)

    if implementation == 'map2':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                           shape=(X.shape[0], ))

        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
      return centers, labels

    elif implementation == 'outer':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                            shape=(X.shape[0],))
        #labels = expr.argmin(distances, axis=1)
        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'broadcast':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        util.log_warn("k_means_ %d %d", i, time.time())
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0],
                                                   centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
        labels = expr.argmin(distances, axis=1)
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                                matches.shape[1], 1)),
                           axis=0)

        counts = counts.optimized().glom()
        centers = centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'shuffle':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        # Reset them to zero.
        new_centers = expr.ndarray((self.n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                  reduce_fn=lambda a, b: a + b)

        _ = expr.shuffle(X,
                         _find_cluster_mapper,
                         kw={'d_pts': X,
                             'old_centers': centers,
                             'new_centers': new_centers,
                             'new_counts': new_counts,
                             'labels': labels},
                         shape_hint=(1,),
                         cost_hint={hash(labels): {'00': 0,
                                                   '01': np.prod(labels.shape)}})
        _.force()

        new_counts = new_counts.glom()
        new_centers = new_centers.glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (new_counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          new_counts[zcount_indices] = 1
          new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        new_centers = new_centers / new_counts
        centers = new_centers

      return centers, labels
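Elsewhere in this listing the estimator is constructed as `KMeans(N_CENTERS, ITER)` and fitted with `fit(pts)`; since every branch of `fit` returns the final `(centers, labels)` pair, a minimal usage sketch is:

pts = expr.rand(1000 * 256, 512)  # distributed random points, as in the kmeans benchmarks
k = KMeans(10, 5)                 # 10 centers, 5 iterations
centers, labels = k.fit(pts)      # uses the default implementation shown above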
Example #49
def run(N_EXAMPLES, N_DIM, iterations):
    x = expr.rand(N_EXAMPLES, N_DIM)
    y = expr.rand(N_EXAMPLES, 1)
    logistic_regression(x, y, iterations)
Example #50
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
    '''
  Cluster data points using the fuzzy k-means clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
    points = expr.force(points)
    num_dim = points.shape[1]
    if centers is None:
        centers = expr.rand(k, num_dim)

    labels = expr.zeros((points.shape[0], ), dtype=np.int)

    for iter in range(num_iter):
        centers = expr.as_array(centers)
        points_broadcast = expr.reshape(points,
                                        (points.shape[0], 1, points.shape[1]))
        centers_broadcast = expr.reshape(
            centers, (1, centers.shape[0], centers.shape[1]))
        distances = expr.sum(expr.square(points_broadcast - centers_broadcast),
                             axis=2)
        # Add a tiny epsilon to avoid division by zero.
        util.log_info('distances shape %s' % str(distances.shape))
        distances_broadcast = expr.reshape(
            distances, (distances.shape[0], 1, distances.shape[1]))
        distances_broadcast2 = expr.reshape(
            distances, (distances.shape[0], distances.shape[1], 1))
        prob = 1.0 / expr.sum(expr.power(
            distances_broadcast / distances_broadcast2, 2.0 / (m - 1)),
                              axis=2)
        prob.force()
        counts = expr.sum(prob, axis=0)
        counts = expr.reshape(counts, (counts.shape[0], 1))
        labels = expr.argmax(prob, axis=1)
        centers = expr.sum(
            expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
            expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
            axis=0)

        # We assume that the centers matrix is small enough to be handled on the master.
        counts = counts.glom()
        centers = centers.glom()
        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(k)

        if np.any(zcount_indices):
            # One or more centroids may not have any points assigned to them, which results in their
            # position being the zero-vector.  We reseed these centroids with new random values
            # and set their counts to 1 in order to get rid of dividing by zero.
            counts[zcount_indices, :] = 1
            centers[zcount_indices, :] = np.random.rand(
                np.count_nonzero(zcount_indices), num_dim)

        centers = centers / counts
    return labels