コード例 #1
0
    def _test_optimization_ordered(self):
        """Compare an optimized expression chain against plain NumPy.

        The same sequence of element-wise operations, nested slices and one
        matrix product is written twice -- once with ``expr`` objects and once
        with NumPy arrays -- and the two results must agree within 1e-10.

        NOTE(review): the leading underscore presumably keeps test runners
        from collecting this method; confirm whether it is disabled on purpose.
        """
        na = np.random.rand(1000, 1000)
        nb = np.random.rand(1000, 1000)
        a = expr.from_numpy(na)
        b = expr.from_numpy(nb)

        # Chain built from expr objects; only evaluated by glom() at the end.
        c = a - b
        d = a + c
        f = c[200:900, 200:900]
        g = d[200:900, 200:900]
        h = f - g
        i = f + h
        j = h[100:500, 100:500]
        k = i[100:500, 100:500]
        l = expr.dot(j, k)
        m = j + k
        n = k - l
        o = n - m
        q = o[100:200, 100:200]

        # Reference: the identical computation carried out with NumPy.
        nc = na - nb
        nd = na + nc
        nf = nc[200:900, 200:900]
        ng = nd[200:900, 200:900]
        nh = nf - ng
        ni = nf + nh
        nj = nh[100:500, 100:500]
        nk = ni[100:500, 100:500]
        nl = np.dot(nj, nk)
        nm = nj + nk
        nn = nk - nl
        no = nn - nm
        nq = no[100:200, 100:200]

        Assert.all_eq(nq, q.optimized().glom(), tolerance=1e-10)
コード例 #2
0
ファイル: test_assign.py プロジェクト: GabrielWen/spartan
  def test_assign_1d(self):
    """Check 1-D ``assign`` against NumPy slice assignment.

    For every (destination, source) region pair a fresh random ``a`` is
    created, ``assign`` writes ``b[region_b]`` into ``a[region_a]``, and the
    glommed result is compared with the same assignment done by NumPy.
    """
    b = np.random.randn(100)
    sp_b = from_numpy(b)

    region_pairs = (
      # a[:] = b[:] copy entire array
      (np.s_[0:100], np.s_[0:100]),
      # a[0] = b[1] copy one value
      (np.s_[0], np.s_[1]),
      # a[0:10] = b[20:30] copy range of values
      (np.s_[0:10], np.s_[20:30]),
      # a[30:60] = b[:30] copy range of values, destination not starting
      # from 0.  This case used to repeat the previous regions, so the
      # offset destination was never exercised.
      (np.s_[30:60], np.s_[0:30]),
    )

    for region_a, region_b in region_pairs:
      a = np.random.randn(100)
      sp_a = assign(from_numpy(a), region_a, sp_b[region_b]).glom()
      a[region_a] = b[region_b]
      Assert.all_eq(sp_a, a)
コード例 #3
0
ファイル: test_optimization.py プロジェクト: MaggieQi/spartan
  def _test_optimization_ordered(self):
    """Compare an optimized expression chain against plain NumPy.

    The same sequence of element-wise operations, nested slices and one
    matrix product is written twice -- once with ``expr`` objects and once
    with NumPy arrays -- and the two results must agree within 1e-10.

    NOTE(review): the leading underscore presumably keeps test runners
    from collecting this method; confirm whether it is disabled on purpose.
    """
    na = np.random.rand(1000, 1000)
    nb = np.random.rand(1000, 1000)
    a = expr.from_numpy(na)
    b = expr.from_numpy(nb)

    # Chain built from expr objects; only evaluated by glom() at the end.
    c = a - b
    d = a + c
    f = c[200:900, 200:900]
    g = d[200:900, 200:900]
    h = f - g
    i = f + h
    j = h[100:500, 100:500]
    k = i[100:500, 100:500]
    l = expr.dot(j, k)
    m = j + k
    n = k - l
    o = n - m
    q = o[100:200, 100:200]

    # Reference: the identical computation carried out with NumPy.
    nc = na - nb
    nd = na + nc
    nf = nc[200:900, 200:900]
    ng = nd[200:900, 200:900]
    nh = nf - ng
    ni = nf + nh
    nj = nh[100:500, 100:500]
    nk = ni[100:500, 100:500]
    nl = np.dot(nj, nk)
    nm = nj + nk
    nn = nk - nl
    no = nn - nm
    nq = no[100:200, 100:200]

    Assert.all_eq(nq, q.optimized().glom(), tolerance = 1e-10)
コード例 #4
0
ファイル: test_assign.py プロジェクト: muddimedia/spartan-1
    def test_assign_1d(self):
        """Check 1-D ``assign`` against NumPy slice assignment.

        For every (destination, source) region pair a fresh random ``a`` is
        created, ``assign`` writes ``b[region_b]`` into ``a[region_a]``, and
        the glommed result is compared with the same assignment done by NumPy.
        """
        b = np.random.randn(100)
        sp_b = from_numpy(b)

        region_pairs = (
            # a[:] = b[:] copy entire array
            (np.s_[0:100], np.s_[0:100]),
            # a[0] = b[1] copy one value
            (np.s_[0], np.s_[1]),
            # a[0:10] = b[20:30] copy range of values
            (np.s_[0:10], np.s_[20:30]),
            # a[30:60] = b[:30] copy range of values, destination not
            # starting from 0.  This case used to repeat the previous
            # regions, so the offset destination was never exercised.
            (np.s_[30:60], np.s_[0:30]),
        )

        for region_a, region_b in region_pairs:
            a = np.random.randn(100)
            sp_a = assign(from_numpy(a), region_a, sp_b[region_b]).glom()
            a[region_a] = b[region_b]
            Assert.all_eq(sp_a, a)
コード例 #5
0
    def fit(self, X, y):
        """Build the forest from the training set ``(X, y)``.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The training input samples.  A ``numpy.ndarray`` is wrapped with
            ``expr.from_numpy`` before being forced.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).

        Returns
        -------
        self : object
            Returns self.
        """
        if isinstance(X, np.ndarray):
            X = expr.from_numpy(X)
        if isinstance(y, np.ndarray):
            y = expr.from_numpy(y)
        X, y = expr.force(X), expr.force(y)

        self.n_classes = np.unique(y.glom()).size
        n_workers = blob_ctx.get().num_workers

        # One task entry per tile; the mapper is invoked once per tile below.
        task_spec = self._create_task_array(n_workers, self.n_estimators)
        task_array = expr.from_numpy(task_spec, tile_hint=(1, )).force()
        target_array = expr.ndarray(
            (task_array.shape[0], ), dtype=object, tile_hint=(1, )).force()

        mapper_kw = {
            'task_array': task_array,
            'target_array': target_array,
            'X': X,
            'y': y,
            'criterion': self.criterion,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'max_features': self.max_features,
            'bootstrap': self.bootstrap,
        }
        task_array.foreach_tile(mapper_fn=_build_mapper, kw=mapper_kw)

        # Target array stores the local random forest each worker builds,
        # it's used for further prediction.
        self.target_array = target_array
        return self
コード例 #6
0
    def test_transpose_dot(self):
        """``expr.dot(A, expr.transpose(B))`` must match the NumPy product."""
        shape = (401, 97)
        lhs = np.random.random(shape)
        rhs = np.random.random(shape)
        expected = np.dot(lhs, np.transpose(rhs))

        sp_lhs = expr.from_numpy(lhs)
        sp_rhs = expr.from_numpy(rhs)
        actual = expr.dot(sp_lhs, expr.transpose(sp_rhs)).glom()
        assert np.isclose(expected, actual).all()
コード例 #7
0
ファイル: test_transpose.py プロジェクト: GabrielWen/spartan
  def test_transpose_dot(self):
    """``expr.dot(A, expr.transpose(B))`` must match the NumPy product."""
    shape = (401, 97)
    lhs = np.random.random(shape)
    rhs = np.random.random(shape)
    expected = np.dot(lhs, np.transpose(rhs))

    sp_lhs = expr.from_numpy(lhs)
    sp_rhs = expr.from_numpy(rhs)
    actual = expr.dot(sp_lhs, expr.transpose(sp_rhs)).glom()
    assert np.isclose(expected, actual).all()
コード例 #8
0
ファイル: k_means_.py プロジェクト: xuanhan863/spartan
  def fit(self, X, centers=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.

    Returns
    -------
    centers : the centers after the last iteration, wrapped with
        ``expr.from_numpy`` (the initial centers if ``self.n_iter`` is 0).
    labels : the ``expr.argmin`` expression from the last iteration giving the
        index of the closest center per point (an all-zero array expression
        if ``self.n_iter`` is 0).
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from recent NumPy releases, so the builtin is used directly.
    labels = expr.zeros((num_points, 1), dtype=int)

    if centers is None:
      centers = expr.from_numpy(np.random.rand(self.n_clusters, num_dim))

    for _ in range(self.n_iter):
      # Broadcast points against centers to get all squared distances.
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)

      # matches[p, c] is 1 when point p is assigned to center c.
      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)

      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to get rid of dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      # Turn the per-center sums into means.
      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

    '''
コード例 #9
0
ファイル: forest.py プロジェクト: GabrielWen/spartan
  def fit(self, X, y):
    """Build the forest from the training set ``(X, y)``.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The training input samples.  A ``numpy.ndarray`` is wrapped with
        ``expr.from_numpy`` before being evaluated.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (integers that correspond to classes in
        classification, real numbers in regression).

    Returns
    -------
    self : object
        Returns self.
    """
    if isinstance(X, np.ndarray):
      X = expr.from_numpy(X)
    if isinstance(y, np.ndarray):
      y = expr.from_numpy(y)
    X, y = X.evaluate(), y.evaluate()

    self.n_classes = np.unique(y.glom()).size
    n_workers = blob_ctx.get().num_workers

    # One task entry per tile; the mapper is invoked once per tile below.
    task_spec = self._create_task_array(n_workers, self.n_estimators)
    task_array = expr.from_numpy(task_spec, tile_hint=(1, )).evaluate()
    target_array = expr.ndarray(
      (task_array.shape[0], ), dtype=object, tile_hint=(1,)).evaluate()

    mapper_kw = {
      'task_array': task_array,
      'target_array': target_array,
      'X': X,
      'y': y,
      'criterion': self.criterion,
      'max_depth': self.max_depth,
      'min_samples_split': self.min_samples_split,
      'min_samples_leaf': self.min_samples_leaf,
      'max_features': self.max_features,
      'bootstrap': self.bootstrap,
    }
    task_array.foreach_tile(mapper_fn=_build_mapper, kw=mapper_kw)

    # Target array stores the local random forest each worker builds,
    # it's used for further prediction.
    self.target_array = target_array
    return self
コード例 #10
0
ファイル: test_assign.py プロジェクト: GabrielWen/spartan
 def test_assign_array_like(self):
   """``assign`` accepts a plain NumPy array as the value to write."""
   target = np.zeros((20, 10))
   row_values = np.ones((10, ))
   region = np.s_[10, ]

   # Run the distributed assignment first, then mirror it on the NumPy array.
   assigned = assign(from_numpy(target), region, row_values).glom()
   target[region] = row_values
   Assert.all_eq(assigned, target)
コード例 #11
0
ファイル: test_assign.py プロジェクト: muddimedia/spartan-1
 def test_assign_array_like(self):
     """``assign`` accepts a plain NumPy array as the value to write."""
     target = np.zeros((20, 10))
     row_values = np.ones((10, ))
     region = np.s_[10, ]

     # Run the distributed assignment first, then mirror it on the NumPy array.
     assigned = assign(from_numpy(target), region, row_values).glom()
     target[region] = row_values
     Assert.all_eq(assigned, target)
コード例 #12
0
    def test_newaxis(self):
        """Indexing with ``newaxis`` must give the same shapes as NumPy."""
        na = np.arange(100).reshape(10, 10)
        a = expr.from_numpy(na)

        n, e = np.newaxis, expr.newaxis
        rows, cols = slice(2, 7), slice(4, 8)

        # Each pair is (NumPy index, equivalent expr index).
        index_pairs = (
            ((n, rows, cols),
             (e, rows, cols)),
            ((n, rows, n, cols),
             (e, rows, e, cols)),
            ((n, rows, n, cols, n),
             (e, rows, e, cols, e)),
            # Extreme case
            ((n, n, n, n, rows, n, n, n, cols, n, n, n),
             (e, e, e, e, rows, e, e, e, cols, e, e, e)),
        )
        for np_index, expr_index in index_pairs:
            Assert.all_eq(na[np_index].shape, a[expr_index].shape)

        logged_expr_index = (e, rows, e, cols, e, e, e)
        logged_np_index = (n, rows, n, cols, n, n, n)
        util.log_info('\na.shape:  %s  \nna.shape: %s',
                      a[logged_expr_index].shape,
                      na[logged_np_index].shape)
コード例 #13
0
ファイル: test_newaxis.py プロジェクト: GabrielWen/spartan
  def test_newaxis(self):
    """Indexing with ``newaxis`` must give the same shapes as NumPy."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    n, e = np.newaxis, expr.newaxis
    rows, cols = slice(2, 7), slice(4, 8)

    # Each pair is (NumPy index, equivalent expr index).
    index_pairs = (
      ((n, rows, cols),
       (e, rows, cols)),
      ((n, rows, n, cols),
       (e, rows, e, cols)),
      ((n, rows, n, cols, n),
       (e, rows, e, cols, e)),
      # Extreme case
      ((n, n, n, n, rows, n, n, n, cols, n, n, n),
       (e, e, e, e, rows, e, e, e, cols, e, e, e)),
    )
    for np_index, expr_index in index_pairs:
      Assert.all_eq(na[np_index].shape, a[expr_index].shape)

    logged_expr_index = (e, rows, e, cols, e, e, e)
    logged_np_index = (n, rows, n, cols, n, n, n)
    util.log_info('\na.shape:  %s  \nna.shape: %s',
                  a[logged_expr_index].shape,
                  na[logged_np_index].shape)
コード例 #14
0
ファイル: unsupervised.py プロジェクト: GabrielWen/spartan
  def kneighbors(self, X, n_neighbors=None):
    """Find the K nearest neighbors of each query point.

    Parameters
    ----------
    X : array-like, last dimension same as that of fit data
        The query points.  A ``numpy.ndarray`` is wrapped with
        ``expr.from_numpy``.

    n_neighbors : int, optional
        Number of neighbors to get.  When given it also replaces
        ``self.n_neighbors``; when omitted the stored value is used.

    Returns
    -------
    dist : array
        Distances to the selected neighbors, one row per query point.

    ind : array
        Indices of the nearest points in the population matrix.
    """
    if n_neighbors is not None:
      self.n_neighbors = n_neighbors
    # Always slice with the stored value: the raw argument may be None, and
    # ``[:, :None]`` would silently return every column instead of K.
    k = self.n_neighbors

    if isinstance(X, np.ndarray):
      X = expr.from_numpy(X)

    if self.algorithm in ('auto', 'brute'):
      # Broadcast queries against the fitted points to get all pairwise
      # squared distances, then keep the K smallest per query.
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      fit_X_broadcast = expr.reshape(self.X, (1, self.X.shape[0], self.X.shape[1]))
      distances = expr.sum((X_broadcast - fit_X_broadcast) ** 2, axis=2)
      neigh_ind = expr.argsort(distances, axis=1)
      neigh_ind = neigh_ind[:, :k].optimized().glom()
      neigh_dist = expr.sort(distances, axis=1)
      neigh_dist = expr.sqrt(neigh_dist[:, :k]).optimized().glom()
      return neigh_dist, neigh_ind

    results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                  kw={'X': self.X, 'Q': X,
                                      'n_neighbors': k,
                                      'algorithm': self.algorithm})

    # Get the KNN candidates for each tile of X, then find out the real KNN.
    dist = None
    ind = None
    for _, candidates in results.iteritems():
      if dist is None:
        dist = candidates[0]
        ind = candidates[1]
      else:
        dist = np.concatenate((dist, candidates[0]), axis=1)
        ind = np.concatenate((ind, candidates[1]), axis=1)

    # Per query row, pick the K candidates with the smallest distance.
    mask = np.argsort(dist, axis=1)[:, :k]
    new_dist = np.array([row[sel] for row, sel in zip(dist, mask)])
    new_ind = np.array([row[sel] for row, sel in zip(ind, mask)])
    return new_dist, new_ind
コード例 #15
0
ファイル: unsupervised.py プロジェクト: EasonLiao/spartan
  def fit(self, X):
    """Store the training data on ``self.X`` for later neighbor queries.

    Parameters
    ----------
    X : numpy.ndarray or expr.Expr
        A ``numpy.ndarray`` is wrapped with ``expr.from_numpy`` using a
        row-wise tile hint derived from the number of workers; an
        ``expr.Expr`` is forced before being stored.

    Returns
    -------
    self : object
        Returns self.
    """
    ctx = blob_ctx.get()
    if isinstance(X, np.ndarray):
      # Floor division keeps the hint an integer even under true division,
      # and the clamp avoids a zero-row hint when there are fewer rows than
      # workers.
      rows_per_tile = max(1, X.shape[0] // ctx.num_workers)
      X = expr.from_numpy(X, tile_hint=(rows_per_tile, X.shape[1]))
    if isinstance(X, expr.Expr):
      X = X.force()

    self.X = X
    return self
コード例 #16
0
 def test_from_np1d(self):
   """Load the same array from ``.npy``, ``.npz`` and memory and compare.

   The temporary files are removed in a ``finally`` block so that a failing
   assertion does not leave them behind, and without shelling out to ``rm``.
   """
   npa = np.random.random((100, 100))
   npy_path = '_test_write1.npy'
   npz_path = '_test_write2.npz'
   try:
     # np.save / np.savez append the extension to the given base name.
     np.save('_test_write1', npa)
     np.savez('_test_write2', npa)
     t1 = expr.from_file(npy_path, sparse = False)
     t2 = expr.from_file(npz_path, sparse = False)
     t3 = expr.from_numpy(npa)
     Assert.all_eq(t1.glom(), npa)
     Assert.all_eq(t2.glom(), npa)
     Assert.all_eq(t3.glom(), npa)
   finally:
     for path in (npy_path, npz_path):
       if os.path.exists(path):
         os.remove(path)
コード例 #17
0
ファイル: test_ssvd.py プロジェクト: rgardner/spartan
def benchmark_ssvd(ctx, timer):
  """Time one ``svd`` call on a random 1280x1280 matrix and print the cost.

  ``ctx`` and ``timer`` are accepted but not used by this benchmark.
  """
  n_rows, n_cols = 1280, 1280
  matrix = expr.from_numpy(np.random.randn(n_rows, n_cols))

  started = datetime.now()
  U, S, VT = svd(matrix)
  finished = datetime.now()

  elapsed_ms = millis(started, finished)
  print("total cost time:%s ms" % elapsed_ms)
コード例 #18
0
ファイル: test_newaxis.py プロジェクト: GabrielWen/spartan
  def test_del_dim(self):
    """Integer indices drop a dimension the same way NumPy does."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    # (index, compare_values): values are compared when True, shapes otherwise.
    checks = (
      ((slice(2, 7), 8), True),
      ((slice(3, 9), 4), False),
      ((slice(2, 7), -1), True),
      ((-1, slice(3, 9)), False),
    )
    for index, compare_values in checks:
      if compare_values:
        Assert.all_eq(na[index], a[index].glom())
      else:
        Assert.all_eq(na[index].shape, a[index].shape)

    logged_index = (slice(3, 9), 4)
    util.log_info('\na.shape: %s \nna.shape %s',
                  a[logged_index].shape, na[logged_index].shape)
コード例 #19
0
ファイル: test_ssvd.py プロジェクト: rossparks/spartan
def benchmark_ssvd(ctx, timer):
    """Time one ``svd`` call on a random 1280x1280 matrix and print the cost.

    ``ctx`` and ``timer`` are accepted but not used by this benchmark.
    """
    n_rows, n_cols = 1280, 1280
    matrix = expr.from_numpy(np.random.randn(n_rows, n_cols))

    started = datetime.now()
    U, S, VT = svd(matrix)
    finished = datetime.now()

    elapsed_ms = millis(started, finished)
    print("total cost time:%s ms" % elapsed_ms)
コード例 #20
0
    def test_del_dim(self):
        """Integer indices drop a dimension the same way NumPy does."""
        na = np.arange(100).reshape(10, 10)
        a = expr.from_numpy(na)

        # (index, compare_values): values are compared when True, shapes
        # otherwise.
        checks = (
            ((slice(2, 7), 8), True),
            ((slice(3, 9), 4), False),
            ((slice(2, 7), -1), True),
            ((-1, slice(3, 9)), False),
        )
        for index, compare_values in checks:
            if compare_values:
                Assert.all_eq(na[index], a[index].glom())
            else:
                Assert.all_eq(na[index].shape, a[index].shape)

        logged_index = (slice(3, 9), 4)
        util.log_info('\na.shape: %s \nna.shape %s',
                      a[logged_index].shape, na[logged_index].shape)
コード例 #21
0
ファイル: test_pca.py プロジェクト: rgardner/spartan
def benchmark_pca(ctx, timer):
  """Time building and fitting a ``PCA`` on random 1280x512 data.

  ``ctx`` and ``timer`` are accepted but not used by this benchmark.
  """
  n_rows, n_cols = 1280, 512
  samples = expr.from_numpy(np.random.randn(n_rows, n_cols))

  # The measured interval covers both constructing the model and fitting it.
  started = datetime.now()
  model = PCA(N_COMPONENTS)
  model.fit(samples)
  finished = datetime.now()

  elapsed_ms = millis(started, finished)
  print("total cost time:%s ms" % elapsed_ms)
コード例 #22
0
ファイル: test_pca.py プロジェクト: rgardner/spartan
  def test_pca(self):
    """Component magnitudes from ``PCA`` must match those of ``SK_PCA``."""
    FLAGS.opt_parakeet_gen = 0
    samples = np.random.randn(*DIM)
    row_tiling = util.calc_tile_hint(DIM, axis=0)
    sp_samples = expr.from_numpy(samples, tile_hint=row_tiling)

    candidate = PCA(N_COMPONENTS)
    reference = SK_PCA(N_COMPONENTS)
    candidate.fit(sp_samples)
    reference.fit(samples)

    print(reference.components_ - candidate.components_)
    # Only magnitudes are compared, so the sign of a component is ignored.
    assert np.allclose(absolute(candidate.components_),
                       absolute(reference.components_))
コード例 #23
0
ファイル: test_pca.py プロジェクト: rossparks/spartan
    def test_pca(self):
        """Component magnitudes from ``PCA`` must match those of ``SK_PCA``."""
        FLAGS.opt_parakeet_gen = 0
        samples = np.random.randn(*DIM)
        row_tiling = util.calc_tile_hint(DIM, axis=0)
        sp_samples = expr.from_numpy(samples, tile_hint=row_tiling)

        candidate = PCA(N_COMPONENTS)
        reference = SK_PCA(N_COMPONENTS)
        candidate.fit(sp_samples)
        reference.fit(samples)

        print(reference.components_ - candidate.components_)
        # Only magnitudes are compared, so the sign of a component is ignored.
        assert np.allclose(absolute(candidate.components_),
                           absolute(reference.components_))
コード例 #24
0
ファイル: test_pca.py プロジェクト: rossparks/spartan
def benchmark_pca(ctx, timer):
    """Time building and fitting a ``PCA`` on random 1280x512 data.

    ``ctx`` and ``timer`` are accepted but not used by this benchmark.
    """
    n_rows, n_cols = 1280, 512
    samples = expr.from_numpy(np.random.randn(n_rows, n_cols))

    # The measured interval covers both constructing the model and fitting it.
    started = datetime.now()
    model = PCA(N_COMPONENTS)
    model.fit(samples)
    finished = datetime.now()

    elapsed_ms = millis(started, finished)
    print("total cost time:%s ms" % elapsed_ms)
コード例 #25
0
ファイル: test_reshape.py プロジェクト: rossparks/spartan
    def test_reshape_dot(self):
        """``expr.dot`` on a reshaped operand must match NumPy."""
        # (lhs shape, rhs shape, position of the operand to reshape, new shape)
        cases = (
            ((357, 93), (31, 357), 0, (1071, 31)),
            ((357, 718), (718, ), 1, (718, 1)),
            ((718, ), (1, 357), 0, (718, 1)),
        )
        for lhs_shape, rhs_shape, which, new_shape in cases:
            operands = [np.random.random(lhs_shape),
                        np.random.random(rhs_shape)]

            np_args = list(operands)
            np_args[which] = np.reshape(operands[which], new_shape)
            expected = np.dot(np_args[0], np_args[1])

            sp_args = [expr.from_numpy(operands[0]),
                       expr.from_numpy(operands[1])]
            sp_args[which] = expr.reshape(sp_args[which], new_shape)
            product = expr.dot(sp_args[0], sp_args[1])
            Assert.all_eq(expected, product.glom(), 10e-9)
コード例 #26
0
ファイル: test_reshape.py プロジェクト: MaggieQi/spartan
  def test_reshape_dot(self):
    """``expr.dot`` on a reshaped operand must match NumPy."""
    # (lhs shape, rhs shape, position of the operand to reshape, new shape)
    cases = (
      ((357, 93), (31, 357), 0, (1071, 31)),
      ((357, 718), (718, ), 1, (718, 1)),
      ((718, ), (1, 357), 0, (718, 1)),
    )
    for lhs_shape, rhs_shape, which, new_shape in cases:
      operands = [np.random.random(lhs_shape),
                  np.random.random(rhs_shape)]

      np_args = list(operands)
      np_args[which] = np.reshape(operands[which], new_shape)
      expected = np.dot(np_args[0], np_args[1])

      sp_args = [expr.from_numpy(operands[0]),
                 expr.from_numpy(operands[1])]
      sp_args[which] = expr.reshape(sp_args[which], new_shape)
      product = expr.dot(sp_args[0], sp_args[1])
      Assert.all_eq(expected, product.glom(), 10e-9)
コード例 #27
0
ファイル: test_optimization.py プロジェクト: MaggieQi/spartan
  def test_optimization_reduced(self):
    """Compare an optimized chain ending in a full sum against NumPy.

    The same element-wise operations, nested slices and matrix product are
    written with ``expr`` objects and with NumPy arrays; the final scalar
    sums must agree within 1e-6.
    """
    na = np.random.rand(1000, 1000)
    nb = np.random.rand(1000, 1000)
    a = expr.from_numpy(na)
    b = expr.from_numpy(nb)

    # Chain built from expr objects; only evaluated by glom() at the end.
    c = a - b
    d = a + c
    f = c[200:900, 200:900]
    g = d[200:900, 200:900]
    h = f - g
    i = f + h
    j = h[100:500, 100:500]
    k = i[100:500, 100:500]
    l = expr.dot(j, k)
    m = j + k
    n = k - l
    o = n - m
    q = n + o
    r = q - m
    s = expr.sum(r)

    # Reference: the identical computation carried out with NumPy.
    nc = na - nb
    nd = na + nc
    nf = nc[200:900, 200:900]
    ng = nd[200:900, 200:900]
    nh = nf - ng
    ni = nf + nh
    nj = nh[100:500, 100:500]
    nk = ni[100:500, 100:500]
    nl = np.dot(nj, nk)
    nm = nj + nk
    nn = nk - nl
    no = nn - nm
    nq = nn + no
    nr = nq - nm
    ns = np.sum(nr)

    # Our sum seems to reduce precision
    Assert.all_eq(ns, s.optimized().glom(), tolerance = 1e-6)
コード例 #28
0
    def test_optimization_reduced(self):
        """Compare an optimized chain ending in a full sum against NumPy.

        The same element-wise operations, nested slices and matrix product
        are written with ``expr`` objects and with NumPy arrays; the final
        scalar sums must agree within 1e-6.
        """
        na = np.random.rand(1000, 1000)
        nb = np.random.rand(1000, 1000)
        a = expr.from_numpy(na)
        b = expr.from_numpy(nb)

        # Chain built from expr objects; only evaluated by glom() at the end.
        c = a - b
        d = a + c
        f = c[200:900, 200:900]
        g = d[200:900, 200:900]
        h = f - g
        i = f + h
        j = h[100:500, 100:500]
        k = i[100:500, 100:500]
        l = expr.dot(j, k)
        m = j + k
        n = k - l
        o = n - m
        q = n + o
        r = q - m
        s = expr.sum(r)

        # Reference: the identical computation carried out with NumPy.
        nc = na - nb
        nd = na + nc
        nf = nc[200:900, 200:900]
        ng = nd[200:900, 200:900]
        nh = nf - ng
        ni = nf + nh
        nj = nh[100:500, 100:500]
        nk = ni[100:500, 100:500]
        nl = np.dot(nj, nk)
        nm = nj + nk
        nn = nk - nl
        no = nn - nm
        nq = nn + no
        nr = nq - nm
        ns = np.sum(nr)

        # Our sum seems to reduce precision
        Assert.all_eq(ns, s.optimized().glom(), tolerance=1e-6)
コード例 #29
0
def benchmark_qr(ctx, timer):
    """Time one ``qr`` call on a random 1280x1280 matrix and print the cost.

    ``ctx`` and ``timer`` are accepted but not used by this benchmark.
    """
    n_rows = 1280
    n_cols = 1280
    matrix = expr.from_numpy(np.random.randn(n_rows, n_cols))

    started = datetime.now()
    Q, R = qr(matrix)
    finished = datetime.now()

    elapsed_ms = millis(started, finished)
    print("total cost time:%s ms" % elapsed_ms)
コード例 #30
0
  def test_optimization_shape(self):
    """Compare an optimized chain that reshapes an operand against NumPy.

    The chain mixes element-wise additions, nested slices, a ravel/reshape
    and a matrix product, written once with ``expr`` objects and once with
    NumPy arrays; the final slices must agree within 1e-10.
    """
    shape = (200, 800)
    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from recent NumPy releases, so the builtin is used directly.
    na = np.arange(np.prod(shape), dtype=int).reshape(shape)
    nb = np.random.randint(1, 1000, (1000, 1000))
    nc = np.random.randint(1, 1000, (1000, 1000))
    a = expr.arange(shape, dtype=int)
    b = expr.from_numpy(nb)
    c = expr.from_numpy(nc)

    # Chain built from expr objects; only evaluated by glom() at the end.
    d = b + c
    e = b + d
    f = d[200:900, 200:900]
    g = e[200:900, 200:900]
    h = f + g
    i = f + h
    j = h[100:500, 100:500]
    k = i[100:300, 100:300]
    l = expr.reshape(expr.ravel(j), (800, 200))
    m = expr.dot(a, l)
    n = m + k
    o = n + m
    q = o[100:200, 100:200]

    # Reference: the identical computation carried out with NumPy.
    nd = nb + nc
    ne = nb + nd
    nf = nd[200:900, 200:900]
    ng = ne[200:900, 200:900]
    nh = nf + ng
    ni = nf + nh
    nj = nh[100:500, 100:500]
    nk = ni[100:300, 100:300]
    nl = np.reshape(np.ravel(nj), (800, 200))
    nm = np.dot(na, nl)
    nn = nm + nk
    no = nn + nm
    nq = no[100:200, 100:200]

    Assert.all_eq(nq, q.optimized().glom(), tolerance = 1e-10)
コード例 #31
0
ファイル: test_qr.py プロジェクト: MaggieQi/spartan
def benchmark_qr(ctx, timer):
  """Time one ``qr`` call on a random 1280x1280 matrix and print the cost.

  ``ctx`` and ``timer`` are accepted but not used by this benchmark.
  """
  n_rows = 1280
  n_cols = 1280
  matrix = expr.from_numpy(np.random.randn(n_rows, n_cols))

  started = datetime.now()
  Q, R = qr(matrix)
  finished = datetime.now()

  elapsed_ms = millis(started, finished)
  print("total cost time:%s ms" % elapsed_ms)
コード例 #32
0
ファイル: test_sort.py プロジェクト: rossparks/spartan
  def test_ndimension(self):
    """``expr.sort``/``expr.argsort`` match NumPy along every axis.

    Five random cases are run, each with 2 to 5 dimensions of extent 5 to 10.
    """
    for case_number in range(1, 6):
      dim = np.random.randint(low=2, high=6)
      shape = np.random.randint(low=5, high=11, size=dim)
      util.log_info('Test Case #%s: DIM(%s) shape%s', case_number, dim, shape)

      na = new_ndarray(shape)
      a = expr.from_numpy(na)

      for axis in range(dim):
        sorted_values = expr.sort(a, axis).glom()
        Assert.all_eq(sorted_values, np.sort(na, axis))
        sorted_indices = expr.argsort(a, axis).glom()
        Assert.all_eq(sorted_indices, np.argsort(na, axis))
コード例 #33
0
ファイル: test_newaxis.py プロジェクト: GabrielWen/spartan
  def test_combo(self):
    """Mixing ``newaxis`` with integer indices gives the same values as NumPy."""
    na = np.arange(100).reshape(10, 10)
    a = expr.from_numpy(na)

    n, e = np.newaxis, expr.newaxis
    rows = slice(2, 7)

    # Each pair is (NumPy index, equivalent expr index).
    index_pairs = (
      ((n, rows, 4),
       (e, rows, 4)),
      ((rows, n, -1),
       (rows, e, -1)),
      ((-1, n, rows),
       (-1, e, rows)),
      ((n, rows, n, n, 4, n, n),
       (e, rows, e, e, 4, e, e)),
    )
    for np_index, expr_index in index_pairs:
      Assert.all_eq(na[np_index], a[expr_index].glom())

    logged_expr_index = (e, rows, e, e, -1, e, e)
    logged_np_index = (n, rows, n, n, -1, n, n)
    util.log_info('\na.shape:  %s \nna.shape: %s',
                  a[logged_expr_index].shape,
                  na[logged_np_index].shape)
コード例 #34
0
    def test_combo(self):
        """Mixing ``newaxis`` with integer indices gives NumPy's values."""
        na = np.arange(100).reshape(10, 10)
        a = expr.from_numpy(na)

        n, e = np.newaxis, expr.newaxis
        rows = slice(2, 7)

        # Each pair is (NumPy index, equivalent expr index).
        index_pairs = (
            ((n, rows, 4),
             (e, rows, 4)),
            ((rows, n, -1),
             (rows, e, -1)),
            ((-1, n, rows),
             (-1, e, rows)),
            ((n, rows, n, n, 4, n, n),
             (e, rows, e, e, 4, e, e)),
        )
        for np_index, expr_index in index_pairs:
            Assert.all_eq(na[np_index], a[expr_index].glom())

        logged_expr_index = (e, rows, e, e, -1, e, e)
        logged_np_index = (n, rows, n, n, -1, n, n)
        util.log_info('\na.shape:  %s \nna.shape: %s',
                      a[logged_expr_index].shape,
                      na[logged_np_index].shape)
コード例 #35
0
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
    '''
    Cluster data points with the k-means spectral clustering method.

    Args:
      points(Expr or DistArray): the data points to be clustered.
      k(int): the number of clusters we need to generate.
      num_iter(int): the max number of iterations that kmeans clustering method runs.
      similarity_measurement(str): distance method used to measure similarity between two points.

    Returns:
      The labels returned by ``KMeans.fit`` on the leading ``k`` eigenvector columns.
    '''
    n_points = points.shape[0]

    # Adjacency matrix: pairwise similarity between all points.
    adjacency = expr.shuffle(
        points,
        _row_similarity_mapper,
        kw={'similarity_measurement': similarity_measurement},
        shape_hint=(n_points, n_points))
    num_dims = adjacency.shape[1]

    # Row sums of the adjacency matrix form the diagonal matrix D.
    degree = expr.sum(adjacency, axis=1, tile_hint=(adjacency.shape[0], ))

    # Normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
    laplacian = expr.shuffle(adjacency,
                             _laplacian_mapper,
                             kw={'D': degree},
                             shape_hint=adjacency.shape)

    # Eigen-decomposition using the Lanczos solver; ask for up to twice as
    # many vectors as clusters and keep the first k columns.
    overshoot = min(k * 2, num_dims)
    eigenvalues, eigenvectors = lanczos.solve(laplacian, laplacian,
                                              overshoot, True)
    eigenvectors = eigenvectors[:, 0:k]

    # Initial centers: for every column, the row holding that column's
    # maximum value.
    init_clusters = eigenvectors[np.argmax(eigenvectors, axis=0)]

    # Run kmeans clustering with init_clusters
    kmeans = KMeans(k, num_iter)
    embedding = expr.from_numpy(eigenvectors)
    centers, labels = kmeans.fit(embedding, init_clusters)

    return labels
コード例 #36
0
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
  '''
  Cluster data points with the k-means spectral clustering method.

  Args:
    points(Expr or DistArray): the data points to be clustered.
    k(int): the number of clusters we need to generate.
    num_iter(int): the max number of iterations that kmeans clustering method runs.
    similarity_measurement(str): distance method used to measure similarity between two points.

  Returns:
    The labels returned by ``KMeans.fit`` on the leading ``k`` eigenvector columns.
  '''
  # Adjacency matrix: pairwise similarity between all points.
  adjacency = expr.shuffle(
    points,
    _row_similarity_mapper,
    kw={'similarity_measurement': similarity_measurement})
  num_dims = adjacency.shape[1]

  # Row sums of the adjacency matrix form the diagonal matrix D.
  degree = expr.sum(adjacency, axis=1, tile_hint=(adjacency.shape[0],))

  # Normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
  laplacian = expr.shuffle(adjacency, _laplacian_mapper, kw={'D': degree})

  # Eigen-decomposition using the Lanczos solver; ask for up to twice as
  # many vectors as clusters and keep the first k columns.
  overshoot = min(k * 2, num_dims)
  eigenvalues, eigenvectors = lanczos.solve(laplacian, laplacian,
                                            overshoot, True)
  eigenvectors = eigenvectors[:, 0:k]

  # Initial centers: for every column, the row holding that column's
  # maximum value.
  init_clusters = eigenvectors[np.argmax(eigenvectors, axis=0)]

  # Run kmeans clustering with init_clusters
  kmeans = KMeans(k, num_iter)
  embedding = expr.from_numpy(eigenvectors)
  centers, labels = kmeans.fit(embedding, init_clusters)

  return labels
コード例 #37
0
ファイル: test_assign.py プロジェクト: muddimedia/spartan-1
    def test_assign_expr(self):
        """``assign`` with an expr value matches NumPy slice assignment."""
        # (shape of a, shape of b, destination region in a)
        cases = (
            # Small matrix
            ((20, 10), (10, ), np.s_[10, ]),
            # Larger matrix
            ((200, 100), (100, ), np.s_[50, ]),
            # Worst case region
            ((200, 100), (3, 50), np.s_[99:102, 25:75]),
        )
        for a_shape, b_shape, region_a in cases:
            a = np.random.randn(*a_shape)
            b = np.random.randn(*b_shape)
            sp_a = assign(from_numpy(a), region_a, from_numpy(b)).glom()
            a[region_a] = b
            Assert.all_eq(sp_a, a)
コード例 #38
0
ファイル: test_assign.py プロジェクト: GabrielWen/spartan
  def test_assign_expr(self):
    """``assign`` with an expr value matches NumPy slice assignment."""
    # (shape of a, shape of b, destination region in a)
    cases = (
      # Small matrix
      ((20, 10), (10, ), np.s_[10, ]),
      # Larger matrix
      ((200, 100), (100, ), np.s_[50, ]),
      # Worst case region
      ((200, 100), (3, 50), np.s_[99:102, 25:75]),
    )
    for a_shape, b_shape, region_a in cases:
      a = np.random.randn(*a_shape)
      b = np.random.randn(*b_shape)
      sp_a = assign(from_numpy(a), region_a, from_numpy(b)).glom()
      a[region_a] = b
      Assert.all_eq(sp_a, a)
コード例 #39
0
ファイル: lanczos.py プロジェクト: muddimedia/spartan-1
def solve(A, AT, desired_rank, is_symmetric=False):
  '''
  A simple implementation of the Lanczos algorithm
  (http://en.wikipedia.org/wiki/Lanczos_algorithm) for eigenvalue computation.

  Like the Mahout implementation, only the matrix*vector step is parallelized.

  First the Lanczos iteration reduces the operator to tridiagonal form. Then
  numpy.linalg.eig extracts the eigenvalues and eigenvectors of that small
  tridiagonal matrix ((desired_rank + 2) x (desired_rank + 2)). Since
  desired_rank should be much smaller than the size of the matrix, this last
  step can be done efficiently on the local machine.

  The operator whose eigenpairs are approximated is ``A`` itself when
  ``is_symmetric`` is true, and ``dot(AT, A)`` otherwise.

  Args:
    A: distributed matrix; only ``A.shape[1]`` and ``expr.dot(A, v)`` are used.
    AT: matrix applied after ``A`` in the non-symmetric case (by its name,
      the transpose of ``A``). Unused when ``is_symmetric`` is true.
    desired_rank: number of eigenpairs to return.
    is_symmetric: if true, apply ``A`` alone in every iteration.

  Returns:
    A tuple ``(d, s)``: ``d`` holds ``desired_rank`` eigenvalues ordered by
    decreasing absolute value and ``s`` is an ``(n, desired_rank)`` array
    whose columns are the matching approximate eigenvectors
    (``n = A.shape[1]``).
  '''
  # Calculate two more eigenvalues, but only keep the largest desired_rank
  # ones. This keeps the result consistent with scipy.sparse.linalg.svds.
  desired_rank += 2

  n = A.shape[1]
  v_next = np.ones(n) / np.sqrt(n)
  v_prev = np.zeros(n)
  # beta[i] couples Lanczos vectors i-1 and i; beta[0] stays 0 because the
  # first vector has no predecessor.
  beta = np.zeros(desired_rank + 1)
  alpha = np.zeros(desired_rank)

  # Since desired_rank << size of the matrix, V is kept in local memory for
  # efficiency (it is updated in every iteration). If V ever fails to fit in
  # local memory it could be turned into a spartan distributed array.
  V = np.zeros((n, desired_rank))

  for i in range(desired_rank):
    util.log_info("Iter : %s", i)
    v_next_expr = expr.from_numpy(v_next.reshape(n, 1))

    # The only distributed step: apply the operator to the current vector.
    if is_symmetric:
      w = expr.dot(A, v_next_expr).optimized().glom().reshape(n)
    else:
      w = expr.dot(A, v_next_expr)
      w = expr.dot(AT, w).optimized().glom().reshape(n)

    alpha[i] = np.dot(w, v_next)
    w = w - alpha[i] * v_next - beta[i] * v_prev

    # Orthogonalize against every previously stored Lanczos vector.
    for t in range(i):
      tmpa = np.dot(w, V[:, t])
      if tmpa == 0.0:
        continue
      w -= tmpa * V[:, t]

    beta[i+1] = np.linalg.norm(w, 2)
    v_prev = v_next
    V[:, i] = v_prev

    if beta[i+1] == 0.0:
      # Breakdown: the vectors found so far span an invariant subspace, so
      # w / beta would be 0 / 0 and poison every later iterate with NaN.
      # Restart from a unit vector orthogonal to the current basis; beta
      # stays 0, which decouples the tridiagonal matrix into blocks.
      v_next = _lanczos_restart_vector(V[:, :i+1])
    else:
      v_next = w / beta[i+1]

  # Create the tridiagonal matrix of size (desired_rank X desired_rank).
  tridiag = np.diag(alpha)
  for i in range(desired_rank - 1):
    tridiag[i, i+1] = beta[i+1]
    tridiag[i+1, i] = beta[i+1]

  # Get eigenvectors and eigenvalues of this tridiagonal matrix.
  # Its eigenvalues approximate the eigenvalues of the iterated operator
  # (dot(A.T, A), or A when is_symmetric). The eigenvectors of the operator
  # are obtained by multiplying V with the eigenvectors of the tridiagonal
  # matrix.
  d, v = np.linalg.eig(tridiag)

  # Sort eigenvalues (largest magnitude first) with their eigenvectors.
  sorted_idx = np.argsort(np.absolute(d))[::-1]
  d = d[sorted_idx]
  v = v[:, sorted_idx]

  # Map the eigenvectors back to the original space.
  s = np.dot(V, v)
  return d[0:desired_rank-2], s[:, 0:desired_rank-2]


def _lanczos_restart_vector(basis, tol=1e-8):
  '''
  Return a unit vector orthogonal to the (orthonormal or zero) columns of
  ``basis``, or the zero vector when the columns already span the space.

  Candidates are the standard basis vectors, tried in order, so the result is
  deterministic.
  '''
  n = basis.shape[0]
  for j in range(n):
    candidate = np.zeros(n)
    candidate[j] = 1.0
    # Two projection passes keep the result orthogonal despite rounding.
    for _ in range(2):
      candidate -= np.dot(basis, np.dot(basis.T, candidate))
    norm = np.linalg.norm(candidate, 2)
    if norm > tol:
      return candidate / norm
  return np.zeros(n)
コード例 #40
0
ファイル: unsupervised.py プロジェクト: GabrielWen/spartan
  def fit(self, X):
    """Remember the training data and return ``self``.

    A ``numpy.ndarray`` is wrapped with ``expr.from_numpy`` first; any other
    input is stored unchanged in ``self.X``.
    """
    self.X = expr.from_numpy(X) if isinstance(X, np.ndarray) else X
    return self
コード例 #41
0
    def kneighbors(self, X, n_neighbors=None):
        """Find the nearest fitted points for every row of ``X``.

        Parameters
        ----------
        X : numpy.ndarray or spartan array, 2-D, last dimension same as that
            of the fit data
            The query points. A ``numpy.ndarray`` is wrapped with
            ``expr.from_numpy``.

        n_neighbors : int, optional
            Number of neighbors to get. When given, it also replaces
            ``self.n_neighbors`` for later calls; when omitted, the current
            ``self.n_neighbors`` is used.

        Returns
        -------
        dist : array
            Distances from each query point to its selected neighbors. For
            the 'auto'/'brute' algorithms these are Euclidean distances; for
            other algorithms they are whatever ``_knn_mapper`` reports.

        ind : array
            Indices of the nearest points in the population matrix.
        """
        if n_neighbors is not None:
            self.n_neighbors = n_neighbors
        # Always work with the effective value. Slicing with a ``None``
        # argument would silently return every fitted point, not the
        # configured number of neighbors.
        n_neighbors = self.n_neighbors

        if isinstance(X, np.ndarray):
            X = expr.from_numpy(X)

        if self.algorithm in ('auto', 'brute'):
            # Broadcast queries (Q, 1, D) against fitted points (1, N, D) to
            # obtain all Q x N squared distances at once.
            X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
            fit_X_broadcast = expr.reshape(
                self.X, (1, self.X.shape[0], self.X.shape[1]))
            distances = expr.sum((X_broadcast - fit_X_broadcast)**2, axis=2)
            neigh_ind = expr.argsort(distances, axis=1)
            neigh_ind = neigh_ind[:, :n_neighbors].optimized().glom()
            neigh_dist = expr.sort(distances, axis=1)
            neigh_dist = expr.sqrt(
                neigh_dist[:, :n_neighbors]).optimized().glom()
            return neigh_dist, neigh_ind

        results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                      kw={
                                          'X': self.X,
                                          'Q': X,
                                          'n_neighbors': n_neighbors,
                                          'algorithm': self.algorithm
                                      })
        # Get the KNN candidates for each tile of X, then find out the real
        # KNN among the concatenated candidates.
        dist = None
        ind = None
        for _tile, candidates in results.iteritems():
            if dist is None:
                dist = candidates[0]
                ind = candidates[1]
            else:
                dist = np.concatenate((dist, candidates[0]), axis=1)
                ind = np.concatenate((ind, candidates[1]), axis=1)

        # Per query row, keep the columns holding the smallest distances.
        mask = np.argsort(dist, axis=1)[:, :n_neighbors]
        new_dist = np.array([row[sel] for row, sel in zip(dist, mask)])
        new_ind = np.array([row[sel] for row, sel in zip(ind, mask)])
        return new_dist, new_ind
コード例 #42
0
ファイル: k_means_.py プロジェクト: rossparks/spartan
    def fit(self, X, centers=None, implementation='map2'):
        """Compute k-means clustering.

        Runs ``self.n_iter`` iterations. Each iteration labels the points
        against the current centers, accumulates per-cluster sums and counts,
        and replaces every center by the mean of its points. Clusters that
        received no points are reseeded with ``np.random.randn`` values.

        Parameters
        ----------
        X : spartan matrix, shape=(n_samples, n_features). It should be tiled
            by rows.
        centers : numpy.ndarray, optional. The initial centers. If None, they
            are randomly generated.
        implementation : str. Which spartan primitive drives the iteration:
            'map2', 'outer', 'broadcast' or 'shuffle'.

        Returns
        -------
        centers : the final centers. A numpy array for 'map2' and 'shuffle';
            wrapped with ``expr.from_numpy`` for 'outer' and 'broadcast'.
        labels : spartan expression/array holding the cluster labels. For
            'shuffle' it is the array handed to ``_find_cluster_mapper``
            (presumably filled in by that mapper -- confirm against it).

        Raises
        ------
        ValueError
            If ``implementation`` is not one of the supported names.
        """
        if implementation not in ('map2', 'outer', 'broadcast', 'shuffle'):
            raise ValueError(
                "unknown k-means implementation: %r" % (implementation,))

        num_dim = X.shape[1]
        num_points = X.shape[0]

        def _means_with_reseed(sums, counts):
            # Turn per-cluster coordinate sums into means (both arguments are
            # local numpy arrays and are modified in place).
            # One or more centroids may not have any points assigned to them,
            # which results in their position being the zero-vector. Those
            # centroids are reseeded with new random values, and their count
            # is forced to 1 to get rid of dividing by zero.
            empty = (counts == 0).reshape(self.n_clusters)
            if np.any(empty):
                counts[empty] = 1
                sums[empty, :] = np.random.randn(np.count_nonzero(empty),
                                                 num_dim)
            return sums / counts.reshape(sums.shape[0], 1)

        # Builtin ``int`` is what the removed ``np.int`` alias referred to.
        labels = expr.zeros((num_points, 1), dtype=int)

        if implementation == 'map2':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.map2(X,
                                   0,
                                   fn=kmeans_map2_dist_mapper,
                                   fn_kw={"centers": centers},
                                   shape=(X.shape[0], ))

                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                centers = _means_with_reseed(centers, counts)
            return centers, labels

        elif implementation == 'outer':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.outer((X, centers), (0, None),
                                    fn=kmeans_outer_dist_mapper,
                                    shape=(X.shape[0], ))
                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                centers = _means_with_reseed(centers, counts)
                # The next iteration passes the centers to expr.outer again.
                centers = expr.from_numpy(centers)
            return centers, labels

        elif implementation == 'broadcast':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                util.log_warn("k_means_ %d %d", i, time.time())
                # (N, 1, D) - (1, K, D) gives every point/center difference.
                X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
                centers_broadcast = expr.reshape(
                    centers, (1, centers.shape[0], centers.shape[1]))
                distances = expr.sum(expr.square(X_broadcast -
                                                 centers_broadcast),
                                     axis=2)
                labels = expr.argmin(distances, axis=1)
                # One-hot membership matrix: matches[p, c] == 1 iff point p
                # is labelled with center c.
                center_idx = expr.arange((1, centers.shape[0]))
                matches = expr.reshape(labels,
                                       (labels.shape[0], 1)) == center_idx
                matches = matches.astype(np.int64)
                counts = expr.sum(matches, axis=0)
                centers = expr.sum(
                    X_broadcast *
                    expr.reshape(matches,
                                 (matches.shape[0], matches.shape[1], 1)),
                    axis=0)

                counts = counts.optimized().glom()
                centers = centers.optimized().glom()

                centers = _means_with_reseed(centers, counts)
                centers = expr.from_numpy(centers)
            return centers, labels

        else:  # implementation == 'shuffle'
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                # Fresh accumulators for this iteration; writes to them are
                # combined by addition.
                new_centers = expr.ndarray((self.n_clusters, num_dim),
                                           reduce_fn=lambda a, b: a + b)
                new_counts = expr.ndarray((self.n_clusters, 1),
                                          dtype=int,
                                          reduce_fn=lambda a, b: a + b)

                shuffled = expr.shuffle(X,
                                        _find_cluster_mapper,
                                        kw={
                                            'd_pts': X,
                                            'old_centers': centers,
                                            'new_centers': new_centers,
                                            'new_counts': new_counts,
                                            'labels': labels
                                        },
                                        shape_hint=(1, ),
                                        cost_hint={
                                            hash(labels): {
                                                '00': 0,
                                                '01': np.prod(labels.shape)
                                            }
                                        })
                # Only the mapper's side effects on the accumulators matter.
                shuffled.force()

                new_counts = new_counts.glom()
                new_centers = new_centers.glom()

                centers = _means_with_reseed(new_centers, new_counts)

            return centers, labels
コード例 #43
0
    def fit(self, X):
        """Store ``X`` as the fitted data and return this estimator.

        Plain ``numpy.ndarray`` input is converted with ``expr.from_numpy``;
        anything else is kept as passed.
        """
        fitted = X
        if isinstance(fitted, np.ndarray):
            fitted = expr.from_numpy(fitted)
        self.X = fitted
        return self
コード例 #44
0
ファイル: k_means_.py プロジェクト: rgardner/spartan
  def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Runs ``self.n_iter`` iterations. Each iteration labels the points against
    the current centers, accumulates per-cluster sums and counts, and replaces
    every center by the mean of its points. Clusters that received no points
    are reseeded with ``np.random.randn`` values.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray, optional. The initial centers. If None, they are
        randomly generated.
    implementation : str. Which spartan primitive drives the iteration:
        'map2', 'outer', 'broadcast' or 'shuffle'.

    Returns
    -------
    centers : the final centers. A numpy array for 'map2' and 'shuffle';
        wrapped with ``expr.from_numpy`` for 'outer' and 'broadcast'.
    labels : spartan expression/array holding the cluster labels. For
        'shuffle' it is the array handed to ``_find_cluster_mapper``
        (presumably filled in by that mapper -- confirm against it).

    Raises
    ------
    ValueError
        If ``implementation`` is not one of the supported names.
    """
    if implementation not in ('map2', 'outer', 'broadcast', 'shuffle'):
      raise ValueError("unknown k-means implementation: %r" % (implementation,))

    num_dim = X.shape[1]
    num_points = X.shape[0]

    def _means_with_reseed(sums, counts):
      # Turn per-cluster coordinate sums into means (both arguments are local
      # numpy arrays and are modified in place).
      # One or more centroids may not have any points assigned to them,
      # which results in their position being the zero-vector. Those
      # centroids are reseeded with new random values, and their count is
      # forced to 1 to get rid of dividing by zero.
      empty = (counts == 0).reshape(self.n_clusters)
      if np.any(empty):
        counts[empty] = 1
        sums[empty, :] = np.random.randn(np.count_nonzero(empty), num_dim)
      return sums / counts.reshape(sums.shape[0], 1)

    # Builtin ``int`` is what the removed ``np.int`` alias referred to.
    labels = expr.zeros((num_points, 1), dtype=int)

    if implementation == 'map2':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                           shape=(X.shape[0], ))

        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        centers = _means_with_reseed(centers, counts)
      return centers, labels

    elif implementation == 'outer':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                            shape=(X.shape[0],))
        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        centers = _means_with_reseed(centers, counts)
        # The next iteration passes the centers to expr.outer again.
        centers = expr.from_numpy(centers)
      return centers, labels

    elif implementation == 'broadcast':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        util.log_warn("k_means_ %d %d", i, time.time())
        # (N, 1, D) - (1, K, D) gives every point/center difference.
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0],
                                                   centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
        labels = expr.argmin(distances, axis=1)
        # One-hot membership matrix: matches[p, c] == 1 iff point p is
        # labelled with center c.
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                                matches.shape[1], 1)),
                           axis=0)

        counts = counts.optimized().glom()
        centers = centers.optimized().glom()

        centers = _means_with_reseed(centers, counts)
        centers = expr.from_numpy(centers)
      return centers, labels

    else:  # implementation == 'shuffle'
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        # Fresh accumulators for this iteration; writes to them are combined
        # by addition.
        new_centers = expr.ndarray((self.n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((self.n_clusters, 1), dtype=int,
                                  reduce_fn=lambda a, b: a + b)

        shuffled = expr.shuffle(X,
                                _find_cluster_mapper,
                                kw={'d_pts': X,
                                    'old_centers': centers,
                                    'new_centers': new_centers,
                                    'new_counts': new_counts,
                                    'labels': labels},
                                shape_hint=(1,),
                                cost_hint={hash(labels): {'00': 0,
                                                          '01': np.prod(labels.shape)}})
        # Only the mapper's side effects on the accumulators matter.
        shuffled.force()

        new_counts = new_counts.glom()
        new_centers = new_centers.glom()

        centers = _means_with_reseed(new_centers, new_counts)

      return centers, labels