Ejemplo n.º 1
0
    def _test_optimization_ordered(self):
        """Check an optimized arithmetic/slice/dot chain against NumPy.

        The same sequence of operations is applied to Spartan expressions
        and to the NumPy arrays they were built from; the optimized Spartan
        result must equal the NumPy result within 1e-10.
        """
        def chain(x, y, dot):
            # One operation sequence shared by both backends; ``dot`` is the
            # backend's matrix product.
            diff = x - y
            total = x + diff
            diff_win = diff[200:900, 200:900]
            total_win = total[200:900, 200:900]
            delta = diff_win - total_win
            mixed = diff_win + delta
            left = delta[100:500, 100:500]
            right = mixed[100:500, 100:500]
            product = dot(left, right)
            combined = left + right
            residual = right - product
            return (residual - combined)[100:200, 100:200]

        na = np.random.rand(1000, 1000)
        nb = np.random.rand(1000, 1000)

        q = chain(expr.from_numpy(na), expr.from_numpy(nb), expr.dot)
        nq = chain(na, nb, np.dot)

        Assert.all_eq(nq, q.optimized().glom(), tolerance=1e-10)
Ejemplo n.º 2
0
  def test_assign_1d(self):
    """Check 1-D ``assign`` against NumPy slice assignment.

    For every (destination region, source region) pair a fresh random
    destination is created, the region is written through ``assign`` and
    through plain NumPy indexing, and the two results are compared.
    """
    b = np.random.randn(100)
    sp_b = from_numpy(b)

    cases = [
      (np.s_[0:100], np.s_[0:100]),  # a[:] = b[:], copy the entire array
      (np.s_[0], np.s_[1]),          # a[0] = b[1], copy one value
      (np.s_[0:10], np.s_[20:30]),   # a[0:10] = b[20:30], copy a range
      # a[30:60] = b[:30], destination range not starting from 0.  This case
      # used to repeat the previous regions and never tested the offset.
      (np.s_[30:60], np.s_[0:30]),
    ]

    for region_a, region_b in cases:
      a = np.random.randn(100)
      sp_a = assign(from_numpy(a), region_a, sp_b[region_b]).glom()
      a[region_a] = b[region_b]
      Assert.all_eq(sp_a, a)
Ejemplo n.º 3
0
  def _test_optimization_ordered(self):
    """Check an optimized arithmetic/slice/dot chain against NumPy.

    The same sequence of operations is applied to Spartan expressions and
    to the NumPy arrays they were built from; the optimized Spartan result
    must equal the NumPy result within 1e-10.
    """
    def chain(x, y, dot):
      # One operation sequence shared by both backends; ``dot`` is the
      # backend's matrix product.
      diff = x - y
      total = x + diff
      diff_win = diff[200:900, 200:900]
      total_win = total[200:900, 200:900]
      delta = diff_win - total_win
      mixed = diff_win + delta
      left = delta[100:500, 100:500]
      right = mixed[100:500, 100:500]
      product = dot(left, right)
      combined = left + right
      residual = right - product
      return (residual - combined)[100:200, 100:200]

    na = np.random.rand(1000, 1000)
    nb = np.random.rand(1000, 1000)

    q = chain(expr.from_numpy(na), expr.from_numpy(nb), expr.dot)
    nq = chain(na, nb, np.dot)

    Assert.all_eq(nq, q.optimized().glom(), tolerance = 1e-10)
Ejemplo n.º 4
0
    def test_assign_1d(self):
        """Check 1-D ``assign`` against NumPy slice assignment.

        For every (destination region, source region) pair a fresh random
        destination is created, the region is written through ``assign``
        and through plain NumPy indexing, and the two results are compared.
        """
        b = np.random.randn(100)
        sp_b = from_numpy(b)

        cases = [
            (np.s_[0:100], np.s_[0:100]),  # a[:] = b[:], copy the entire array
            (np.s_[0], np.s_[1]),          # a[0] = b[1], copy one value
            (np.s_[0:10], np.s_[20:30]),   # a[0:10] = b[20:30], copy a range
            # a[30:60] = b[:30], destination range not starting from 0.  This
            # case used to repeat the previous regions and never tested the
            # offset.
            (np.s_[30:60], np.s_[0:30]),
        ]

        for region_a, region_b in cases:
            a = np.random.randn(100)
            sp_a = assign(from_numpy(a), region_a, sp_b[region_b]).glom()
            a[region_a] = b[region_b]
            Assert.all_eq(sp_a, a)
Ejemplo n.º 5
0
    def fit(self, X, y):
        """Run ``_build_mapper`` over every tile of the task array.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).

        Returns
        -------
        self : object
            Returns self.
        """
        # NumPy inputs are wrapped first so both operands can be forced.
        if isinstance(X, np.ndarray):
            X = expr.from_numpy(X)
        if isinstance(y, np.ndarray):
            y = expr.from_numpy(y)

        X = expr.force(X)
        y = expr.force(y)

        self.n_classes = np.unique(y.glom()).size
        n_workers = blob_ctx.get().num_workers

        # Both arrays are created with a tile hint of a single element.
        tasks = self._create_task_array(n_workers, self.n_estimators)
        task_array = expr.from_numpy(tasks, tile_hint=(1, )).force()
        target_array = expr.ndarray(
            (task_array.shape[0], ), dtype=object, tile_hint=(1, )).force()

        mapper_kw = dict(
            task_array=task_array,
            target_array=target_array,
            X=X,
            y=y,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            max_features=self.max_features,
            bootstrap=self.bootstrap,
        )
        task_array.foreach_tile(mapper_fn=_build_mapper, kw=mapper_kw)

        # Target array stores the local random forest each worker builds,
        # it's used for further prediction.
        self.target_array = target_array
        return self
Ejemplo n.º 6
0
    def test_transpose_dot(self):
        """Compare ``expr.dot(t1, expr.transpose(t2))`` with NumPy."""
        lhs = np.random.random((401, 97))
        rhs = np.random.random((401, 97))
        expected = np.dot(lhs, np.transpose(rhs))

        sp_lhs = expr.from_numpy(lhs)
        sp_rhs = expr.from_numpy(rhs)
        product = expr.dot(sp_lhs, expr.transpose(sp_rhs))

        # The mirrored case, dot(transpose(lhs), rhs), is not exercised here.
        assert np.allclose(expected, product.glom())
Ejemplo n.º 7
0
  def test_transpose_dot(self):
    """Compare ``expr.dot(t1, expr.transpose(t2))`` with NumPy."""
    lhs = np.random.random((401, 97))
    rhs = np.random.random((401, 97))
    expected = np.dot(lhs, np.transpose(rhs))

    sp_lhs = expr.from_numpy(lhs)
    sp_rhs = expr.from_numpy(rhs)
    product = expr.dot(sp_lhs, expr.transpose(sp_rhs))

    # The mirrored case, dot(transpose(lhs), rhs), is not exercised here.
    assert np.allclose(expected, product.glom())
Ejemplo n.º 8
0
  def fit(self, X, centers=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.

    Returns
    -------
    centers : the cluster centers after ``self.n_iter`` iterations.
    labels : for each point, the index of its closest center as computed in
        the last iteration (an all-zero array if no iteration ran).
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    # Builtin ``int`` is what the removed ``np.int`` alias referred to.
    labels = expr.zeros((num_points, 1), dtype=int)

    if centers is None:
      centers = expr.from_numpy(np.random.rand(self.n_clusters, num_dim))

    for i in range(self.n_iter):
      # Broadcast points against centers to get squared distances with one
      # row per point and one column per center.
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)

      # matches[p, c] is 1 when point p is assigned to center c, else 0.
      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)

      # Per-center sum of the assigned points; divided by counts below.
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # Find centroids that don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)

      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to get rid of dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

    '''
Ejemplo n.º 9
0
  def fit(self, X, y):
    """Run ``_build_mapper`` over every tile of the task array.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (integers that correspond to classes in
        classification, real numbers in regression).

    Returns
    -------
    self : object
        Returns self.
    """
    # NumPy inputs are wrapped first so both operands can be evaluated.
    if isinstance(X, np.ndarray):
      X = expr.from_numpy(X)
    if isinstance(y, np.ndarray):
      y = expr.from_numpy(y)

    X = X.evaluate()
    y = y.evaluate()

    self.n_classes = np.unique(y.glom()).size
    n_workers = blob_ctx.get().num_workers

    # Both arrays are created with a tile hint of a single element.
    tasks = self._create_task_array(n_workers, self.n_estimators)
    task_array = expr.from_numpy(tasks, tile_hint=(1, )).evaluate()
    target_array = expr.ndarray(
        (task_array.shape[0], ), dtype=object, tile_hint=(1,)).evaluate()

    mapper_kw = dict(
        task_array=task_array,
        target_array=target_array,
        X=X,
        y=y,
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        max_features=self.max_features,
        bootstrap=self.bootstrap,
    )
    task_array.foreach_tile(mapper_fn=_build_mapper, kw=mapper_kw)

    # Target array stores the local random forest each worker builds,
    # it's used for further prediction.
    self.target_array = target_array
    return self
Ejemplo n.º 10
0
 def test_assign_array_like(self):
   """Assign a plain NumPy row into a Spartan array and compare with NumPy."""
   target = np.zeros((20, 10))
   row = np.ones((10, ))
   region = np.s_[10, ]

   # The value is handed over as a raw ndarray, not wrapped in from_numpy.
   assigned = assign(from_numpy(target), region, row).glom()
   target[region] = row
   Assert.all_eq(assigned, target)
Ejemplo n.º 11
0
 def test_assign_array_like(self):
     """Assign a plain NumPy row into a Spartan array and compare with NumPy."""
     target = np.zeros((20, 10))
     row = np.ones((10, ))
     region = np.s_[10, ]

     # The value is handed over as a raw ndarray, not wrapped in from_numpy.
     assigned = assign(from_numpy(target), region, row).glom()
     target[region] = row
     Assert.all_eq(assigned, target)
Ejemplo n.º 12
0
    def test_newaxis(self):
        """Shapes produced by ``expr.newaxis`` indexing must match NumPy's."""
        na = np.arange(100).reshape(10, 10)
        a = expr.from_numpy(na)

        # One leading new axis.
        np_shape = na[np.newaxis, 2:7, 4:8].shape
        sp_shape = a[expr.newaxis, 2:7, 4:8].shape
        Assert.all_eq(np_shape, sp_shape)

        # New axes before and between the slices.
        np_shape = na[np.newaxis, 2:7, np.newaxis, 4:8].shape
        sp_shape = a[expr.newaxis, 2:7, expr.newaxis, 4:8].shape
        Assert.all_eq(np_shape, sp_shape)

        # New axes before, between and after the slices.
        np_shape = na[np.newaxis, 2:7, np.newaxis, 4:8, np.newaxis].shape
        sp_shape = a[expr.newaxis, 2:7, expr.newaxis, 4:8, expr.newaxis].shape
        Assert.all_eq(np_shape, sp_shape)

        # Extreme case: runs of new axes around every slice.
        np_shape = na[np.newaxis, np.newaxis, np.newaxis, np.newaxis,
                      2:7,
                      np.newaxis, np.newaxis, np.newaxis,
                      4:8,
                      np.newaxis, np.newaxis, np.newaxis].shape
        sp_shape = a[expr.newaxis, expr.newaxis, expr.newaxis, expr.newaxis,
                     2:7,
                     expr.newaxis, expr.newaxis, expr.newaxis,
                     4:8,
                     expr.newaxis, expr.newaxis, expr.newaxis].shape
        Assert.all_eq(np_shape, sp_shape)

        logged_sp_shape = a[expr.newaxis, 2:7, expr.newaxis, 4:8,
                            expr.newaxis, expr.newaxis, expr.newaxis].shape
        logged_np_shape = na[np.newaxis, 2:7, np.newaxis, 4:8,
                             np.newaxis, np.newaxis, np.newaxis].shape
        util.log_info('\na.shape:  %s  \nna.shape: %s',
                      logged_sp_shape, logged_np_shape)
Ejemplo n.º 13
0
  def test_newaxis(self):
    """Shapes produced by ``expr.newaxis`` indexing must match NumPy's."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    # One leading new axis.
    np_shape = na[np.newaxis, 2:7, 4:8].shape
    sp_shape = a[expr.newaxis, 2:7, 4:8].shape
    Assert.all_eq(np_shape, sp_shape)

    # New axes before and between the slices.
    np_shape = na[np.newaxis, 2:7, np.newaxis, 4:8].shape
    sp_shape = a[expr.newaxis, 2:7, expr.newaxis, 4:8].shape
    Assert.all_eq(np_shape, sp_shape)

    # New axes before, between and after the slices.
    np_shape = na[np.newaxis, 2:7, np.newaxis, 4:8, np.newaxis].shape
    sp_shape = a[expr.newaxis, 2:7, expr.newaxis, 4:8, expr.newaxis].shape
    Assert.all_eq(np_shape, sp_shape)

    # Extreme case: runs of new axes around every slice.
    np_shape = na[np.newaxis, np.newaxis, np.newaxis, np.newaxis,
                  2:7,
                  np.newaxis, np.newaxis, np.newaxis,
                  4:8,
                  np.newaxis, np.newaxis, np.newaxis].shape
    sp_shape = a[expr.newaxis, expr.newaxis, expr.newaxis, expr.newaxis,
                 2:7,
                 expr.newaxis, expr.newaxis, expr.newaxis,
                 4:8,
                 expr.newaxis, expr.newaxis, expr.newaxis].shape
    Assert.all_eq(np_shape, sp_shape)

    logged_sp_shape = a[expr.newaxis, 2:7, expr.newaxis, 4:8,
                        expr.newaxis, expr.newaxis, expr.newaxis].shape
    logged_np_shape = na[np.newaxis, 2:7, np.newaxis, 4:8,
                         np.newaxis, np.newaxis, np.newaxis].shape
    util.log_info('\na.shape:  %s  \nna.shape: %s',
                  logged_sp_shape, logged_np_shape)
Ejemplo n.º 14
0
  def kneighbors(self, X, n_neighbors=None):
    """Finds the K-neighbors of a point.

    Parameters
    ----------
    X : array-like, last dimension same as that of fit data
        The new point.

    n_neighbors : int
        Number of neighbors to get (default is the current
        ``self.n_neighbors``). When given, it is also stored on
        ``self.n_neighbors``.

    Returns
    -------
    dist : array
        Array representing the lengths to point.

    ind : array
        Indices of the nearest points in the population matrix.
    """
    if n_neighbors is not None:
      self.n_neighbors = n_neighbors
    # Always slice with the effective value: the raw argument may be None,
    # in which case ``[:, :None]`` would keep every column.
    k = self.n_neighbors

    if isinstance(X, np.ndarray):
      X = expr.from_numpy(X)

    if self.algorithm in ('auto', 'brute'):
      # Squared distance between every query row and every fitted row.
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      fit_X_broadcast = expr.reshape(self.X, (1, self.X.shape[0], self.X.shape[1]))
      distances = expr.sum((X_broadcast - fit_X_broadcast) ** 2, axis=2)
      neigh_ind = expr.argsort(distances, axis=1)
      neigh_ind = neigh_ind[:, :k].optimized().glom()
      neigh_dist = expr.sort(distances, axis=1)
      neigh_dist = expr.sqrt(neigh_dist[:, :k]).optimized().glom()
      return neigh_dist, neigh_ind

    results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                  kw={'X': self.X, 'Q': X,
                                      'n_neighbors': self.n_neighbors,
                                      'algorithm': self.algorithm})

    # Get the KNN candidates for each tile of X, then find out the real KNN.
    candidates = [v for _, v in results.iteritems()]
    dist = np.concatenate([c[0] for c in candidates], axis=1)
    ind = np.concatenate([c[1] for c in candidates], axis=1)

    # Per query row, keep the k candidates with the smallest distance.
    mask = np.argsort(dist, axis=1)[:, :k]
    new_dist = np.array([row[sel] for row, sel in zip(dist, mask)])
    new_ind = np.array([row[sel] for row, sel in zip(ind, mask)])
    return new_dist, new_ind
Ejemplo n.º 15
0
  def fit(self, X):
    """Store the data set on ``self.X`` and return ``self``.

    A NumPy input is converted with a tile hint of
    (rows // num_workers, columns); an ``expr.Expr`` is forced before it
    is stored.
    """
    ctx = blob_ctx.get()
    if isinstance(X, np.ndarray):
      # Floor division keeps the tile hint integral under Python 3 too.
      # Presumably num_workers is an int, in which case this equals the
      # Python 2 ``/`` result -- TODO confirm.
      rows_per_worker = X.shape[0] // ctx.num_workers
      X = expr.from_numpy(X, tile_hint=(rows_per_worker, X.shape[1]))
    if isinstance(X, expr.Expr):
      X = X.force()

    self.X = X
    return self
Ejemplo n.º 16
0
 def test_from_np1d(self):
   """Round-trip a random matrix through .npy/.npz files and from_numpy.

   The temporary files are removed even when an assertion fails.
   """
   npa = np.random.random((100, 100))
   written = ('_test_write1.npy', '_test_write2.npz')
   try:
     # np.save / np.savez append the .npy / .npz suffix themselves.
     np.save('_test_write1', npa)
     np.savez('_test_write2', npa)
     t1 = expr.from_file('_test_write1.npy', sparse = False)
     t2 = expr.from_file('_test_write2.npz', sparse = False)
     t3 = expr.from_numpy(npa)
     Assert.all_eq(t1.glom(), npa)
     Assert.all_eq(t2.glom(), npa)
     Assert.all_eq(t3.glom(), npa)
   finally:
     # Clean up without shelling out, so a failed check leaves no files
     # behind and the test does not depend on an ``rm`` binary.
     for path in written:
       if os.path.exists(path):
         os.remove(path)
Ejemplo n.º 17
0
def benchmark_ssvd(ctx, timer):
  """Time one ``svd`` call on a 1280 x 1280 random matrix and print it.

  ``ctx`` and ``timer`` are accepted but not used in this function.
  """
  DIM = (1280, 1280)
  # Alternative input kept for reference: expr.randn(*DIM, dtype=np.float64)
  A = expr.from_numpy(np.random.randn(*DIM))

  t1 = datetime.now()
  U, S, VT = svd(A)
  t2 = datetime.now()
  cost_time = millis(t1, t2)

  # Parenthesised form prints the same text under Python 2 and also parses
  # under Python 3.
  print("total cost time:%s ms" % (cost_time))
Ejemplo n.º 18
0
  def test_del_dim(self):
    """Integer indices must drop the indexed dimension, as in NumPy."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    # Positive integer index: values, then shapes.
    expected_values = na[2:7, 8]
    Assert.all_eq(expected_values, a[2:7, 8].glom())
    expected_shape = na[3:9, 4].shape
    Assert.all_eq(expected_shape, a[3:9, 4].shape)

    # Negative integer index, on either axis.
    expected_values = na[2:7, -1]
    Assert.all_eq(expected_values, a[2:7, -1].glom())
    expected_shape = na[-1, 3:9].shape
    Assert.all_eq(expected_shape, a[-1, 3:9].shape)

    sp_shape = a[3:9, 4].shape
    np_shape = na[3:9, 4].shape
    util.log_info('\na.shape: %s \nna.shape %s', sp_shape, np_shape)
Ejemplo n.º 19
0
def benchmark_ssvd(ctx, timer):
    """Time one ``svd`` call on a 1280 x 1280 random matrix and print it.

    ``ctx`` and ``timer`` are accepted but not used in this function.
    """
    DIM = (1280, 1280)
    # Alternative input kept for reference: expr.randn(*DIM, dtype=np.float64)
    A = expr.from_numpy(np.random.randn(*DIM))

    t1 = datetime.now()
    U, S, VT = svd(A)
    t2 = datetime.now()
    cost_time = millis(t1, t2)

    # Parenthesised form prints the same text under Python 2 and also parses
    # under Python 3.
    print("total cost time:%s ms" % (cost_time))
Ejemplo n.º 20
0
    def test_del_dim(self):
        """Integer indices must drop the indexed dimension, as in NumPy."""
        na = np.arange(100).reshape(10, 10)
        a = expr.from_numpy(na)

        # Positive integer index: values, then shapes.
        expected_values = na[2:7, 8]
        Assert.all_eq(expected_values, a[2:7, 8].glom())
        expected_shape = na[3:9, 4].shape
        Assert.all_eq(expected_shape, a[3:9, 4].shape)

        # Negative integer index, on either axis.
        expected_values = na[2:7, -1]
        Assert.all_eq(expected_values, a[2:7, -1].glom())
        expected_shape = na[-1, 3:9].shape
        Assert.all_eq(expected_shape, a[-1, 3:9].shape)

        sp_shape = a[3:9, 4].shape
        np_shape = na[3:9, 4].shape
        util.log_info('\na.shape: %s \nna.shape %s', sp_shape, np_shape)
Ejemplo n.º 21
0
def benchmark_pca(ctx, timer):
  """Time one ``PCA(N_COMPONENTS).fit`` on a 1280 x 512 random matrix.

  ``ctx`` and ``timer`` are accepted but not used in this function.
  """
  DIM = (1280, 512)
  data = np.random.randn(*DIM)
  # Alternative input kept for reference: expr.randn(*DIM, dtype=np.float64)
  A = expr.from_numpy(data)

  # Model construction is included in the timed section.
  t1 = datetime.now()
  m = PCA(N_COMPONENTS)
  m.fit(A)
  t2 = datetime.now()
  cost_time = millis(t1, t2)

  # Parenthesised form prints the same text under Python 2 and also parses
  # under Python 3.
  print("total cost time:%s ms" % (cost_time))
Ejemplo n.º 22
0
  def test_pca(self):
    """Compare ``PCA`` components with ``SK_PCA`` up to sign."""
    FLAGS.opt_parakeet_gen = 0
    data = np.random.randn(*DIM)
    A = expr.from_numpy(data, tile_hint=util.calc_tile_hint(DIM, axis=0))

    m = PCA(N_COMPONENTS)
    m2 = SK_PCA(N_COMPONENTS)

    m.fit(A)
    m2.fit(data)

    # Diagnostic output; the function-call form also parses under Python 3.
    print(m2.components_ - m.components_)
    # Absolute values are compared, so a sign flip of a component passes.
    assert np.allclose(absolute(m.components_), absolute(m2.components_))
Ejemplo n.º 23
0
    def test_pca(self):
        """Compare ``PCA`` components with ``SK_PCA`` up to sign."""
        FLAGS.opt_parakeet_gen = 0
        data = np.random.randn(*DIM)
        A = expr.from_numpy(data, tile_hint=util.calc_tile_hint(DIM, axis=0))

        m = PCA(N_COMPONENTS)
        m2 = SK_PCA(N_COMPONENTS)

        m.fit(A)
        m2.fit(data)

        # Diagnostic output; the function-call form also parses under
        # Python 3.
        print(m2.components_ - m.components_)
        # Absolute values are compared, so a sign flip of a component passes.
        assert np.allclose(absolute(m.components_), absolute(m2.components_))
Ejemplo n.º 24
0
def benchmark_pca(ctx, timer):
    """Time one ``PCA(N_COMPONENTS).fit`` on a 1280 x 512 random matrix.

    ``ctx`` and ``timer`` are accepted but not used in this function.
    """
    DIM = (1280, 512)
    data = np.random.randn(*DIM)
    # Alternative input kept for reference: expr.randn(*DIM, dtype=np.float64)
    A = expr.from_numpy(data)

    # Model construction is included in the timed section.
    t1 = datetime.now()
    m = PCA(N_COMPONENTS)
    m.fit(A)
    t2 = datetime.now()
    cost_time = millis(t1, t2)

    # Parenthesised form prints the same text under Python 2 and also parses
    # under Python 3.
    print("total cost time:%s ms" % (cost_time))
Ejemplo n.º 25
0
    def test_reshape_dot(self):
        """Check ``expr.dot`` with a reshaped operand against NumPy."""
        tolerance = 10e-9

        # Left operand reshaped: (357, 93) -> (1071, 31), times (31, 357).
        lhs = np.random.random((357, 93))
        rhs = np.random.random((31, 357))
        expected = np.dot(np.reshape(lhs, (1071, 31)), rhs)

        sp_lhs = expr.from_numpy(lhs)
        sp_rhs = expr.from_numpy(rhs)
        actual = expr.dot(expr.reshape(sp_lhs, (1071, 31)), sp_rhs)
        Assert.all_eq(expected, actual.glom(), tolerance)

        # Right operand reshaped from a vector: (718, ) -> (718, 1).
        lhs = np.random.random((357, 718))
        rhs = np.random.random((718, ))
        expected = np.dot(lhs, np.reshape(rhs, (718, 1)))

        sp_lhs = expr.from_numpy(lhs)
        sp_rhs = expr.from_numpy(rhs)
        actual = expr.dot(sp_lhs, expr.reshape(sp_rhs, (718, 1)))
        Assert.all_eq(expected, actual.glom(), tolerance)

        # Left operand reshaped from a vector: (718, ) -> (718, 1), times
        # (1, 357).
        lhs = np.random.random((718, ))
        rhs = np.random.random((1, 357))
        expected = np.dot(np.reshape(lhs, (718, 1)), rhs)

        sp_lhs = expr.from_numpy(lhs)
        sp_rhs = expr.from_numpy(rhs)
        actual = expr.dot(expr.reshape(sp_lhs, (718, 1)), sp_rhs)
        Assert.all_eq(expected, actual.glom(), tolerance)
Ejemplo n.º 26
0
  def test_reshape_dot(self):
    """Check ``expr.dot`` with a reshaped operand against NumPy."""
    tolerance = 10e-9

    # Left operand reshaped: (357, 93) -> (1071, 31), times (31, 357).
    lhs = np.random.random((357, 93))
    rhs = np.random.random((31, 357))
    expected = np.dot(np.reshape(lhs, (1071, 31)), rhs)

    sp_lhs = expr.from_numpy(lhs)
    sp_rhs = expr.from_numpy(rhs)
    actual = expr.dot(expr.reshape(sp_lhs, (1071, 31)), sp_rhs)
    Assert.all_eq(expected, actual.glom(), tolerance)

    # Right operand reshaped from a vector: (718, ) -> (718, 1).
    lhs = np.random.random((357, 718))
    rhs = np.random.random((718, ))
    expected = np.dot(lhs, np.reshape(rhs, (718, 1)))

    sp_lhs = expr.from_numpy(lhs)
    sp_rhs = expr.from_numpy(rhs)
    actual = expr.dot(sp_lhs, expr.reshape(sp_rhs, (718, 1)))
    Assert.all_eq(expected, actual.glom(), tolerance)

    # Left operand reshaped from a vector: (718, ) -> (718, 1), times (1, 357).
    lhs = np.random.random((718, ))
    rhs = np.random.random((1, 357))
    expected = np.dot(np.reshape(lhs, (718, 1)), rhs)

    sp_lhs = expr.from_numpy(lhs)
    sp_rhs = expr.from_numpy(rhs)
    actual = expr.dot(expr.reshape(sp_lhs, (718, 1)), sp_rhs)
    Assert.all_eq(expected, actual.glom(), tolerance)
Ejemplo n.º 27
0
  def test_optimization_reduced(self):
    """Check an optimized chain ending in a full sum against NumPy.

    The same operation sequence is applied to Spartan expressions and to
    the NumPy arrays they were built from; the two sums must agree within
    1e-6.
    """
    def chain(x, y, dot, total):
      # One operation sequence shared by both backends; ``dot`` and
      # ``total`` are the backend's matrix product and sum.
      diff = x - y
      plus = x + diff
      diff_win = diff[200:900, 200:900]
      plus_win = plus[200:900, 200:900]
      delta = diff_win - plus_win
      mixed = diff_win + delta
      left = delta[100:500, 100:500]
      right = mixed[100:500, 100:500]
      product = dot(left, right)
      combined = left + right
      residual = right - product
      gap = residual - combined
      merged = residual + gap
      return total(merged - combined)

    na = np.random.rand(1000, 1000)
    nb = np.random.rand(1000, 1000)

    s = chain(expr.from_numpy(na), expr.from_numpy(nb), expr.dot, expr.sum)
    ns = chain(na, nb, np.dot, np.sum)

    # Our sum seems to reduce precision
    Assert.all_eq(ns, s.optimized().glom(), tolerance = 1e-6)
Ejemplo n.º 28
0
    def test_optimization_reduced(self):
        """Check an optimized chain ending in a full sum against NumPy.

        The same operation sequence is applied to Spartan expressions and
        to the NumPy arrays they were built from; the two sums must agree
        within 1e-6.
        """
        def chain(x, y, dot, total):
            # One operation sequence shared by both backends; ``dot`` and
            # ``total`` are the backend's matrix product and sum.
            diff = x - y
            plus = x + diff
            diff_win = diff[200:900, 200:900]
            plus_win = plus[200:900, 200:900]
            delta = diff_win - plus_win
            mixed = diff_win + delta
            left = delta[100:500, 100:500]
            right = mixed[100:500, 100:500]
            product = dot(left, right)
            combined = left + right
            residual = right - product
            gap = residual - combined
            merged = residual + gap
            return total(merged - combined)

        na = np.random.rand(1000, 1000)
        nb = np.random.rand(1000, 1000)

        s = chain(expr.from_numpy(na), expr.from_numpy(nb), expr.dot, expr.sum)
        ns = chain(na, nb, np.dot, np.sum)

        # Our sum seems to reduce precision
        Assert.all_eq(ns, s.optimized().glom(), tolerance=1e-6)
Ejemplo n.º 29
0
def benchmark_qr(ctx, timer):
    """Time one ``qr`` call on a 1280 x 1280 random matrix and print it.

    ``ctx`` and ``timer`` are accepted but not used in this function.
    """
    M = 1280
    N = 1280
    # Alternative input kept for reference: expr.randn(M, N)
    Y = expr.from_numpy(np.random.randn(M, N))

    t1 = datetime.now()
    Q, R = qr(Y)
    t2 = datetime.now()
    cost_time = millis(t1, t2)

    # Parenthesised form prints the same text under Python 2 and also parses
    # under Python 3.
    print("total cost time:%s ms" % (cost_time))
Ejemplo n.º 30
0
  def test_optimization_shape(self):
    """Check an optimized chain with ravel/reshape/dot against NumPy.

    The same operation sequence is applied through the ``expr`` functions
    and through the ``np`` functions; the optimized Spartan result must
    equal the NumPy result within 1e-10.
    """
    def chain(mod, lead, x, y):
      # ``mod`` supplies reshape/ravel/dot for the backend (expr or np).
      d = x + y
      e = x + d
      f = d[200:900, 200:900]
      g = e[200:900, 200:900]
      h = f + g
      i = f + h
      j = h[100:500, 100:500]
      k = i[100:300, 100:300]
      # 400 x 400 window flattened and re-cut to 800 x 200 for the product.
      l = mod.reshape(mod.ravel(j), (800, 200))
      m = mod.dot(lead, l)
      n = m + k
      o = n + m
      return o[100:200, 100:200]

    shape = (200, 800)
    # Builtin ``int`` is what the removed ``np.int`` alias referred to.
    na = np.arange(np.prod(shape), dtype=int).reshape(shape)
    nb = np.random.randint(1, 1000, (1000, 1000))
    nc = np.random.randint(1, 1000, (1000, 1000))
    a = expr.arange(shape, dtype=int)
    b = expr.from_numpy(nb)
    c = expr.from_numpy(nc)

    q = chain(expr, a, b, c)
    nq = chain(np, na, nb, nc)

    Assert.all_eq(nq, q.optimized().glom(), tolerance = 1e-10)
Ejemplo n.º 31
0
def benchmark_qr(ctx, timer):
  """Time one ``qr`` call on a 1280 x 1280 random matrix and print it.

  ``ctx`` and ``timer`` are accepted but not used in this function.
  """
  M = 1280
  N = 1280
  # Alternative input kept for reference: expr.randn(M, N)
  Y = expr.from_numpy(np.random.randn(M, N))

  t1 = datetime.now()
  Q, R = qr(Y)
  t2 = datetime.now()
  cost_time = millis(t1, t2)

  # Parenthesised form prints the same text under Python 2 and also parses
  # under Python 3.
  print("total cost time:%s ms" % (cost_time))
Ejemplo n.º 32
0
  def test_ndimension(self):
    """Compare ``expr.sort``/``expr.argsort`` with NumPy on random shapes.

    Five cases are run; each draws a dimension count in [2, 5] and a side
    length in [5, 10] per dimension, then checks every axis.
    """
    # ``range`` works on Python 2 and 3; ``xrange`` exists only on Python 2.
    for case in range(5):
      dim = np.random.randint(low=2, high=6)
      shape = np.random.randint(low=5, high=11, size=dim)
      util.log_info('Test Case #%s: DIM(%s) shape%s', case + 1, dim, shape)

      na = new_ndarray(shape)
      a = expr.from_numpy(na)

      for axis in range(dim):
        Assert.all_eq(expr.sort(a, axis).glom(),
                      np.sort(na, axis))
        Assert.all_eq(expr.argsort(a, axis).glom(),
                      np.argsort(na, axis))
Ejemplo n.º 33
0
  def test_combo(self):
    """Mixing newaxis, slices and integer indices must match NumPy."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    # New axis first, integer index last.
    expected = na[np.newaxis, 2:7, 4]
    Assert.all_eq(expected, a[expr.newaxis, 2:7, 4].glom())

    # New axis between a slice and a negative integer index.
    expected = na[2:7, np.newaxis, -1]
    Assert.all_eq(expected, a[2:7, expr.newaxis, -1].glom())

    # Negative integer index first.
    expected = na[-1, np.newaxis, 2:7]
    Assert.all_eq(expected, a[-1, expr.newaxis, 2:7].glom())

    # Several new axes around both the slice and the integer index.
    expected = na[np.newaxis, 2:7, np.newaxis, np.newaxis, 4,
                  np.newaxis, np.newaxis]
    actual = a[expr.newaxis, 2:7, expr.newaxis, expr.newaxis, 4,
               expr.newaxis, expr.newaxis].glom()
    Assert.all_eq(expected, actual)

    sp_shape = a[expr.newaxis, 2:7, expr.newaxis, expr.newaxis, -1,
                 expr.newaxis, expr.newaxis].shape
    np_shape = na[np.newaxis, 2:7, np.newaxis, np.newaxis, -1,
                  np.newaxis, np.newaxis].shape
    util.log_info('\na.shape:  %s \nna.shape: %s', sp_shape, np_shape)
Ejemplo n.º 34
0
    def test_combo(self):
        """Mixing newaxis, slices and integer indices must match NumPy."""
        na = np.arange(100).reshape(10, 10)
        a = expr.from_numpy(na)

        # New axis first, integer index last.
        expected = na[np.newaxis, 2:7, 4]
        Assert.all_eq(expected, a[expr.newaxis, 2:7, 4].glom())

        # New axis between a slice and a negative integer index.
        expected = na[2:7, np.newaxis, -1]
        Assert.all_eq(expected, a[2:7, expr.newaxis, -1].glom())

        # Negative integer index first.
        expected = na[-1, np.newaxis, 2:7]
        Assert.all_eq(expected, a[-1, expr.newaxis, 2:7].glom())

        # Several new axes around both the slice and the integer index.
        expected = na[np.newaxis, 2:7, np.newaxis, np.newaxis, 4,
                      np.newaxis, np.newaxis]
        actual = a[expr.newaxis, 2:7, expr.newaxis, expr.newaxis, 4,
                   expr.newaxis, expr.newaxis].glom()
        Assert.all_eq(expected, actual)

        sp_shape = a[expr.newaxis, 2:7, expr.newaxis, expr.newaxis, -1,
                     expr.newaxis, expr.newaxis].shape
        np_shape = na[np.newaxis, 2:7, np.newaxis, np.newaxis, -1,
                      np.newaxis, np.newaxis].shape
        util.log_info('\na.shape:  %s \nna.shape: %s', sp_shape, np_shape)
Ejemplo n.º 35
0
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
    '''
    Cluster data points using the kmeans spectral clustering method.

    Args:
      points(Expr or DistArray): the data points to be clustered.
      k(int): the number of clusters we need to generate.
      num_iter(int): the max number of iterations that kmeans clustering method runs.
      similarity_measurement(str): distance method used to measure similarity between two points.

    Returns:
      The labels returned by ``KMeans.fit``.
    '''
    n_points = points.shape[0]

    # Adjacency matrix A: similarity for each pair of points.
    mapper_args = {'similarity_measurement': similarity_measurement}
    A = expr.shuffle(points,
                     _row_similarity_mapper,
                     kw=mapper_args,
                     shape_hint=(n_points, n_points))

    num_dims = A.shape[1]

    # Diagonal matrix D, built from the row sums of A.
    D = expr.sum(A, axis=1, tile_hint=(A.shape[0], ))

    # Normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
    L = expr.shuffle(A, _laplacian_mapper, kw={'D': D}, shape_hint=A.shape)

    # Eigen-decomposition using the Lanczos solver; only the first k columns
    # of the second result are kept.
    overshoot = min(k * 2, num_dims)
    _, basis = lanczos.solve(L, L, overshoot, True)
    basis = basis[:, 0:k]

    # Initial clusters: for each column, the row holding that column's
    # maximum value.
    init_clusters = basis[np.argmax(basis, axis=0)]

    # Run kmeans clustering with init_clusters
    kmeans = KMeans(k, num_iter)
    _, labels = kmeans.fit(expr.from_numpy(basis), init_clusters)

    return labels
Ejemplo n.º 36
0
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
  '''
  Cluster data points using the kmeans spectral clustering method.

  Args:
    points(Expr or DistArray): the data points to be clustered.
    k(int): the number of clusters we need to generate.
    num_iter(int): the max number of iterations that kmeans clustering method runs.
    similarity_measurement(str): distance method used to measure similarity between two points.

  Returns:
    The labels returned by ``KMeans.fit``.
  '''
  # Adjacency matrix A: similarity for each pair of points.
  mapper_args = {'similarity_measurement': similarity_measurement}
  A = expr.shuffle(points, _row_similarity_mapper, kw=mapper_args)

  num_dims = A.shape[1]

  # Diagonal matrix D, built from the row sums of A.
  D = expr.sum(A, axis=1, tile_hint=(A.shape[0],))

  # Normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
  L = expr.shuffle(A, _laplacian_mapper, kw={'D': D})

  # Eigen-decomposition using the Lanczos solver; only the first k columns
  # of the second result are kept.
  overshoot = min(k * 2, num_dims)
  _, basis = lanczos.solve(L, L, overshoot, True)
  basis = basis[:, 0:k]

  # Initial clusters: for each column, the row holding that column's
  # maximum value.
  init_clusters = basis[np.argmax(basis, axis=0)]

  # Run kmeans clustering with init_clusters
  kmeans = KMeans(k, num_iter)
  _, labels = kmeans.fit(expr.from_numpy(basis), init_clusters)

  return labels
Ejemplo n.º 37
0
    def test_assign_expr(self):
        """Assign a Spartan expression into a region and compare with NumPy."""
        # (destination shape, value shape, destination region)
        cases = [
            ((20, 10), (10, ), np.s_[10, ]),              # Small matrix
            ((200, 100), (100, ), np.s_[50, ]),           # Larger matrix
            ((200, 100), (3, 50), np.s_[99:102, 25:75]),  # Worst case region
        ]

        for dest_shape, value_shape, region_a in cases:
            a = np.random.randn(*dest_shape)
            b = np.random.randn(*value_shape)
            sp_a = assign(from_numpy(a), region_a, from_numpy(b)).glom()
            a[region_a] = b
            Assert.all_eq(sp_a, a)
Ejemplo n.º 38
0
  def test_assign_expr(self):
    """Assign a Spartan expression into a region and compare with NumPy."""
    # (destination shape, value shape, destination region)
    cases = [
      ((20, 10), (10, ), np.s_[10, ]),              # Small matrix
      ((200, 100), (100, ), np.s_[50, ]),           # Larger matrix
      ((200, 100), (3, 50), np.s_[99:102, 25:75]),  # Worst case region
    ]

    for dest_shape, value_shape, region_a in cases:
      a = np.random.randn(*dest_shape)
      b = np.random.randn(*value_shape)
      sp_a = assign(from_numpy(a), region_a, from_numpy(b)).glom()
      a[region_a] = b
      Assert.all_eq(sp_a, a)
Ejemplo n.º 39
0
def solve(A, AT, desired_rank, is_symmetric=False):
  '''
  A simple implementation of the Lanczos algorithm
  (http://en.wikipedia.org/wiki/Lanczos_algorithm) for eigenvalue computation.

  Like the Mahout implementation, only the matrix*vector step is parallelized.

  First we use the Lanczos method to reduce the operator to tridiagonal form.
  Then we use numpy.linalg.eig to extract the eigenvalues and eigenvectors of
  that small tridiagonal matrix. Since its size is at most
  (desired_rank + 2) x (desired_rank + 2), this can be done efficiently on the
  local machine.

  The operator that is decomposed is ``A`` itself when ``is_symmetric`` is
  True, and ``dot(AT, A)`` otherwise (``AT`` is presumably the transpose of
  ``A``; it is only used in the non-symmetric case).

  Args:
    A: matrix accepted by ``expr.dot``; ``A.shape[1]`` is the operator size n.
    AT: second factor applied after ``A`` when ``is_symmetric`` is False.
    desired_rank: number of eigenpairs to return.
    is_symmetric: if True, multiply by ``A`` only.

  Returns:
    (d, s): ``d`` holds the eigenvalue estimates sorted by decreasing absolute
    value and ``s`` the matching eigenvector estimates as columns (n rows).
    Up to ``desired_rank`` pairs are returned; fewer come back when n is
    smaller than that or when the iteration breaks down early (the Krylov
    subspace is exhausted), instead of NaN-filled results.
  '''
  n = A.shape[1]

  # Calculate two more eigenvalues, but we only keep the largest desired_rank
  # ones. Doing this keeps the result consistent with scipy.sparse.linalg.svds.
  # There are at most n mutually orthogonal Lanczos vectors, so never take
  # more than n steps.
  num_steps = min(desired_rank + 2, n)

  v_next = np.ones(n) / np.sqrt(n)
  v_prev = np.zeros(n)
  alpha = np.zeros(num_steps)
  # beta[0] stays 0: there is no previous vector in the first step.
  beta = np.zeros(num_steps + 1)

  # Since desired_rank << size of the matrix, we keep V in local memory for
  # efficiency (it is updated in every iteration). If V ever does not fit in
  # local memory, it could be turned into a spartan distributed array.
  V = np.zeros((n, num_steps))

  eps = np.finfo(np.float64).eps
  steps_done = 0

  for i in range(num_steps):
    util.log_info("Iter : %s", i)
    v_next_expr = expr.from_numpy(v_next.reshape(n, 1))

    if is_symmetric:
      w = expr.dot(A, v_next_expr).optimized().glom().reshape(n)
    else:
      w = expr.dot(A, v_next_expr)
      w = expr.dot(AT, w).optimized().glom().reshape(n)

    # Scale of the matrix-vector product, used to recognize a residual that
    # is nothing but round-off.
    w_norm = np.linalg.norm(w, 2)

    alpha[i] = np.dot(w, v_next)
    w = w - alpha[i] * v_next - beta[i] * v_prev

    # Orthogonalize against all previously stored Lanczos vectors.
    for t in range(i):
      w -= np.dot(w, V[:, t]) * V[:, t]

    beta[i+1] = np.linalg.norm(w, 2)
    V[:, i] = v_next
    steps_done = i + 1

    # Breakdown: the residual vanished, so the vectors found so far span an
    # invariant subspace. Dividing by beta here would fill the basis with
    # NaN/inf, so stop and use the steps completed so far.
    if beta[i+1] <= eps * w_norm:
      break

    v_prev = v_next
    v_next = w / beta[i+1]

  # Create the tridiagonal matrix with size (steps_done X steps_done).
  tridiag = np.diag(alpha[:steps_done])
  for i in range(steps_done - 1):
    tridiag[i, i+1] = beta[i+1]
    tridiag[i+1, i] = beta[i+1]

  # Get eigenvectors and eigenvalues of this tridiagonal matrix.
  # Its eigenvalues approximate the eigenvalues of the operator applied above
  # (A if is_symmetric, otherwise dot(AT, A)). The eigenvectors of that
  # operator are obtained by multiplying V with the eigenvectors of the
  # tridiagonal matrix.
  d, v = np.linalg.eig(tridiag)

  # Sort eigenvalues (largest magnitude first) and their eigenvectors.
  sorted_idx = np.argsort(np.absolute(d))[::-1]
  d = d[sorted_idx]
  v = v[:, sorted_idx]

  # Map the eigenvectors back from the Lanczos basis to the original space.
  s = np.dot(V[:, :steps_done], v)
  return d[0:desired_rank], s[:, 0:desired_rank]
Ejemplo n.º 40
0
  def fit(self, X):
    """Remember ``X`` on the instance as ``self.X`` and return ``self``.

    A ``numpy.ndarray`` is first wrapped with ``expr.from_numpy``; any other
    value is stored unchanged.
    """
    if not isinstance(X, np.ndarray):
      self.X = X
    else:
      self.X = expr.from_numpy(X)
    return self
Ejemplo n.º 41
0
    def kneighbors(self, X, n_neighbors=None):
        """Finds the K-neighbors of a point.

        Returns the distances to, and the indices of, the nearest points of
        the fitted data ``self.X``.

        Parameters
        ----------
        X : array-like, last dimension same as that of fit data
            The new point(s). A numpy array is wrapped with
            ``expr.from_numpy``.

        n_neighbors : int
            Number of neighbors to get (default is ``self.n_neighbors``).
            When given, the value is also stored on ``self.n_neighbors`` and
            therefore becomes the default for later calls.

        Returns
        -------
        dist : array
            Distances from each query point to its neighbors.

        ind : array
            Indices of the nearest points in the population matrix.
        """
        if n_neighbors is not None:
            self.n_neighbors = n_neighbors
        # Always slice with the stored value: the raw argument may be None,
        # and `[:, :None]` would return every fitted point instead of the
        # configured number of neighbors.
        k = self.n_neighbors

        if isinstance(X, np.ndarray):
            X = expr.from_numpy(X)

        if self.algorithm in ('auto', 'brute'):
            # Pairwise squared distances via broadcasting:
            # (n_query, 1, dim) - (1, n_fit, dim), summed over the last axis.
            X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
            fit_X_broadcast = expr.reshape(
                self.X, (1, self.X.shape[0], self.X.shape[1]))
            distances = expr.sum((X_broadcast - fit_X_broadcast)**2, axis=2)
            neigh_ind = expr.argsort(distances, axis=1)
            neigh_ind = neigh_ind[:, :k].optimized().glom()
            neigh_dist = expr.sort(distances, axis=1)
            # The square root is taken only on the k columns that are kept.
            neigh_dist = expr.sqrt(neigh_dist[:, :k]).optimized().glom()
            return neigh_dist, neigh_ind

        results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                      kw={
                                          'X': self.X,
                                          'Q': X,
                                          'n_neighbors': k,
                                          'algorithm': self.algorithm
                                      })

        # Get the KNN candidates for each tile of X, then find out the real
        # KNN. Each result value is indexed as (distances, indices); they are
        # collected first and joined once along the candidate axis.
        dist_parts = []
        ind_parts = []
        for _, candidates in results.iteritems():
            dist_parts.append(candidates[0])
            ind_parts.append(candidates[1])
        dist = np.concatenate(dist_parts, axis=1)
        ind = np.concatenate(ind_parts, axis=1)

        # Per query row, keep the k smallest candidate distances and the
        # indices stored at the same positions.
        mask = np.argsort(dist, axis=1)[:, :k]
        rows = np.arange(dist.shape[0])[:, np.newaxis]
        return dist[rows, mask], ind[rows, mask]
Ejemplo n.º 42
0
    def fit(self, X, centers=None, implementation='map2'):
        """Compute k-means clustering.

        Runs ``self.n_iter`` rounds of: assign a label to every point, sum
        the points and count them per cluster, then divide sums by counts.
        Clusters that received no point are re-seeded with
        ``np.random.randn`` values.

        Parameters
        ----------
        X : spartan matrix, shape=(n_samples, n_features). It should be tiled
            by rows.
        centers : numpy.ndarray. The initial centers. If None, it will be
            randomly generated.
        implementation : str. One of 'map2', 'outer', 'broadcast' or
            'shuffle'; selects which spartan primitives perform the
            assignment and update steps.

        Returns
        -------
        (centers, labels) : ``centers`` is a numpy array for 'map2' and
            'shuffle', and the result of ``expr.from_numpy`` for 'outer' and
            'broadcast'. ``labels`` is the expression produced by the last
            assignment step; for 'shuffle' it is the zero-initialized array
            handed to ``_find_cluster_mapper`` (presumably filled in by that
            mapper -- confirm against its definition).

        Raises
        ------
        ValueError : if ``implementation`` is not one of the names above
            (instead of silently returning None).
        """
        if implementation not in ('map2', 'outer', 'broadcast', 'shuffle'):
            raise ValueError(
                "implementation must be one of 'map2', 'outer', 'broadcast' "
                "or 'shuffle', got %r" % (implementation,))

        num_dim = X.shape[1]
        num_points = X.shape[0]
        n_clusters = self.n_clusters

        # Builtin `int` is the object the old `np.int` alias referred to; the
        # alias itself has been removed from current NumPy releases.
        labels = expr.zeros((num_points, 1), dtype=int)

        def normalize(sums, counts):
            # Turn per-cluster sums and counts into centers (both are local
            # numpy arrays and are modified in place).
            # If any centroids don't have any points assigned to them.
            empty = (counts == 0).reshape(n_clusters)
            if np.any(empty):
                # One or more centroids may not have any points assigned to
                # them, which results in their position being the zero-vector.
                # We reseed these centroids with new random values.
                n_empty = np.count_nonzero(empty)
                # In order to get rid of dividing by zero.
                counts[empty] = 1
                sums[empty, :] = np.random.randn(n_empty, num_dim)
            return sums / counts.reshape(sums.shape[0], 1)

        def centers_from_labels(point_labels, old_centers):
            # Count and sum the points of every cluster with map2, then
            # normalize locally.
            counts = expr.map2(point_labels,
                               0,
                               fn=kmeans_count_mapper,
                               fn_kw={'centers_count': n_clusters},
                               shape=(old_centers.shape[0], ))
            sums = expr.map2((X, point_labels), (0, 0),
                             fn=kmeans_center_mapper,
                             fn_kw={'centers_count': n_clusters},
                             shape=(old_centers.shape[0],
                                    old_centers.shape[1]))
            counts = counts.optimized().glom()
            sums = sums.optimized().glom()
            return normalize(sums, counts)

        if implementation == 'map2':
            if centers is None:
                centers = np.random.rand(n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.map2(X,
                                   0,
                                   fn=kmeans_map2_dist_mapper,
                                   fn_kw={"centers": centers},
                                   shape=(X.shape[0], ))
                centers = centers_from_labels(labels, centers)

        elif implementation == 'outer':
            if centers is None:
                centers = expr.rand(n_clusters, num_dim)
            elif isinstance(centers, np.ndarray):
                # Every later iteration feeds expr.outer a spartan array;
                # give the first one the same type.
                centers = expr.from_numpy(centers)

            for i in range(self.n_iter):
                labels = expr.outer((X, centers), (0, None),
                                    fn=kmeans_outer_dist_mapper,
                                    shape=(X.shape[0], ))
                centers = expr.from_numpy(centers_from_labels(labels, centers))

        elif implementation == 'broadcast':
            if centers is None:
                centers = expr.rand(n_clusters, num_dim)
            elif isinstance(centers, np.ndarray):
                # Same reasoning as for 'outer'.
                centers = expr.from_numpy(centers)

            for i in range(self.n_iter):
                util.log_warn("k_means_ %d %d", i, time.time())
                # Squared distances of every point to every center:
                # (n, 1, dim) - (1, k, dim), summed over the feature axis.
                X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
                centers_broadcast = expr.reshape(
                    centers, (1, centers.shape[0], centers.shape[1]))
                distances = expr.sum(expr.square(X_broadcast -
                                                 centers_broadcast),
                                     axis=2)
                labels = expr.argmin(distances, axis=1)
                # 0/1 membership matrix of shape (n, k).
                center_idx = expr.arange((1, centers.shape[0]))
                matches = expr.reshape(labels,
                                       (labels.shape[0], 1)) == center_idx
                matches = matches.astype(np.int64)
                counts = expr.sum(matches, axis=0)
                sums = expr.sum(
                    X_broadcast *
                    expr.reshape(matches,
                                 (matches.shape[0], matches.shape[1], 1)),
                    axis=0)

                counts = counts.optimized().glom()
                sums = sums.optimized().glom()
                centers = expr.from_numpy(normalize(sums, counts))

        else:  # implementation == 'shuffle'
            if centers is None:
                centers = np.random.rand(n_clusters, num_dim)

            for i in range(self.n_iter):
                # Fresh accumulators for every iteration; contributions are
                # combined by addition.
                new_centers = expr.ndarray((n_clusters, num_dim),
                                           reduce_fn=lambda a, b: a + b)
                new_counts = expr.ndarray((n_clusters, 1),
                                          dtype=int,
                                          reduce_fn=lambda a, b: a + b)

                shuffled = expr.shuffle(X,
                                        _find_cluster_mapper,
                                        kw={
                                            'd_pts': X,
                                            'old_centers': centers,
                                            'new_centers': new_centers,
                                            'new_counts': new_counts,
                                            'labels': labels
                                        },
                                        shape_hint=(1, ),
                                        cost_hint={
                                            hash(labels): {
                                                '00': 0,
                                                '01': np.prod(labels.shape)
                                            }
                                        })
                shuffled.force()

                new_counts = new_counts.glom()
                new_centers = new_centers.glom()
                centers = normalize(new_centers, new_counts)

        return centers, labels
Ejemplo n.º 43
0
    def fit(self, X):
        """Store ``X`` as ``self.X`` and return the instance itself.

        Plain numpy arrays are converted with ``expr.from_numpy`` before
        being stored; every other input is kept as passed.
        """
        is_local_array = isinstance(X, np.ndarray)
        self.X = expr.from_numpy(X) if is_local_array else X
        return self
Ejemplo n.º 44
0
  def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Runs ``self.n_iter`` rounds of: assign a label to every point, sum the
    points and count them per cluster, then divide sums by counts. Clusters
    that received no point are re-seeded with ``np.random.randn`` values.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    implementation : str. One of 'map2', 'outer', 'broadcast' or 'shuffle';
        selects which spartan primitives perform the assignment and update
        steps.

    Returns
    -------
    (centers, labels) : ``centers`` is a numpy array for 'map2' and 'shuffle',
        and the result of ``expr.from_numpy`` for 'outer' and 'broadcast'.
        ``labels`` is the expression produced by the last assignment step; for
        'shuffle' it is the zero-initialized array handed to
        ``_find_cluster_mapper`` (presumably filled in by that mapper --
        confirm against its definition).

    Raises
    ------
    ValueError : if ``implementation`` is not one of the names above (instead
        of silently returning None).
    """
    if implementation not in ('map2', 'outer', 'broadcast', 'shuffle'):
      raise ValueError(
          "implementation must be one of 'map2', 'outer', 'broadcast' "
          "or 'shuffle', got %r" % (implementation,))

    num_dim = X.shape[1]
    num_points = X.shape[0]
    n_clusters = self.n_clusters

    # Builtin `int` is the object the old `np.int` alias referred to; the
    # alias itself has been removed from current NumPy releases.
    labels = expr.zeros((num_points, 1), dtype=int)

    def normalize(sums, counts):
      # Turn per-cluster sums and counts into centers (both are local numpy
      # arrays and are modified in place).
      # If any centroids don't have any points assigned to them.
      empty = (counts == 0).reshape(n_clusters)
      if np.any(empty):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed
        # these centroids with new random values.
        n_empty = np.count_nonzero(empty)
        # In order to get rid of dividing by zero.
        counts[empty] = 1
        sums[empty, :] = np.random.randn(n_empty, num_dim)
      return sums / counts.reshape(sums.shape[0], 1)

    def centers_from_labels(point_labels, old_centers):
      # Count and sum the points of every cluster with map2, then normalize
      # locally.
      counts = expr.map2(point_labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': n_clusters},
                         shape=(old_centers.shape[0], ))
      sums = expr.map2((X, point_labels), (0, 0), fn=kmeans_center_mapper,
                       fn_kw={'centers_count': n_clusters},
                       shape=(old_centers.shape[0], old_centers.shape[1]))
      counts = counts.optimized().glom()
      sums = sums.optimized().glom()
      return normalize(sums, counts)

    if implementation == 'map2':
      if centers is None:
        centers = np.random.rand(n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                           fn_kw={"centers": centers},
                           shape=(X.shape[0], ))
        centers = centers_from_labels(labels, centers)

    elif implementation == 'outer':
      if centers is None:
        centers = expr.rand(n_clusters, num_dim)
      elif isinstance(centers, np.ndarray):
        # Every later iteration feeds expr.outer a spartan array; give the
        # first one the same type.
        centers = expr.from_numpy(centers)

      for i in range(self.n_iter):
        labels = expr.outer((X, centers), (0, None),
                            fn=kmeans_outer_dist_mapper,
                            shape=(X.shape[0],))
        centers = expr.from_numpy(centers_from_labels(labels, centers))

    elif implementation == 'broadcast':
      if centers is None:
        centers = expr.rand(n_clusters, num_dim)
      elif isinstance(centers, np.ndarray):
        # Same reasoning as for 'outer'.
        centers = expr.from_numpy(centers)

      for i in range(self.n_iter):
        util.log_warn("k_means_ %d %d", i, time.time())
        # Squared distances of every point to every center:
        # (n, 1, dim) - (1, k, dim), summed over the feature axis.
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0],
                                                   centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast),
                             axis=2)
        labels = expr.argmin(distances, axis=1)
        # 0/1 membership matrix of shape (n, k).
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        sums = expr.sum(X_broadcast * expr.reshape(matches,
                                                   (matches.shape[0],
                                                    matches.shape[1], 1)),
                        axis=0)

        counts = counts.optimized().glom()
        sums = sums.optimized().glom()
        centers = expr.from_numpy(normalize(sums, counts))

    else:  # implementation == 'shuffle'
      if centers is None:
        centers = np.random.rand(n_clusters, num_dim)

      for i in range(self.n_iter):
        # Fresh accumulators for every iteration; contributions are combined
        # by addition.
        new_centers = expr.ndarray((n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((n_clusters, 1), dtype=int,
                                  reduce_fn=lambda a, b: a + b)

        shuffled = expr.shuffle(X,
                                _find_cluster_mapper,
                                kw={'d_pts': X,
                                    'old_centers': centers,
                                    'new_centers': new_centers,
                                    'new_counts': new_counts,
                                    'labels': labels},
                                shape_hint=(1,),
                                cost_hint={hash(labels): {'00': 0,
                                                          '01': np.prod(labels.shape)}})
        shuffled.force()

        new_counts = new_counts.glom()
        new_centers = new_centers.glom()
        centers = normalize(new_centers, new_counts)

    return centers, labels