Beispiel #1
0
 def __init__(self, C=1.0, tol=1e-6, maxiter=50):
   """Record the constructor arguments and reset the model state.

   ``C``, ``tol`` and ``maxiter`` are stored unchanged on the instance
   (presumably the regularisation strength, convergence tolerance and
   iteration cap of the solver -- confirm against the fitting code).
   """
   # User-supplied settings, kept verbatim.
   self.C, self.tol, self.maxiter = C, tol, maxiter
   # Initial state: the `usew_` flag is off and `b` starts at zero.
   self.usew_, self.b = False, 0.0
   # Handle on the current blob context.
   self.ctx = blob_ctx.get()
Beispiel #2
0
 def __init__(self, C=1.0, tol=1e-6, maxiter=50):
     """Store the settings passed in and initialise the remaining attributes.

     Parameters
     ----------
     C : float, default 1.0
     tol : float, default 1e-6
     maxiter : int, default 50
         All three are saved as-is on ``self``; their exact meaning is
         decided by the code that reads them (not visible here).
     """
     (self.C, self.tol, self.maxiter) = (C, tol, maxiter)
     (self.usew_, self.b) = (False, 0.0)
     # Look the blob context up once and keep it on the instance.
     self.ctx = blob_ctx.get()
Beispiel #3
0
  def _fit_transform(self, X):
    """Fit the neighbour model on ``X`` and compute the embedding.

    Steps, as performed below:

    1. fit ``self.nbrs_`` on ``X`` and build its k-neighbours distance graph;
    2. run ``_shortest_path_mapper`` over every tile of a task array, passing
       it the graph and a shared ``dist_matrix`` (presumably the mapper writes
       the graph shortest-path distances into ``dist_matrix`` -- confirm in
       the mapper);
    3. feed ``-0.5 * dist_matrix ** 2`` to a precomputed-kernel ``KernelPCA``.

    Parameters
    ----------
    X : array with a ``shape`` attribute
        Training samples; ``X.shape[0]`` is used as the number of points.

    Sets ``training_data_``, ``kernel_pca_``, ``dist_matrix_`` and
    ``embedding_`` on ``self``; returns nothing.
    """
    self.nbrs_.fit(X)
    self.training_data_ = self.nbrs_._fit_X
    self.kernel_pca_ = KernelPCA(n_components=self.n_components,
                                 kernel="precomputed",
                                 eigen_solver=self.eigen_solver,
                                 tol=self.tol, max_iter=self.max_iter)

    kng = kneighbors_graph(self.nbrs_, self.n_neighbors, mode="distance")
    n_points = X.shape[0]
    n_workers = blob_ctx.get().num_workers

    # A tile extent has to be a whole number.  Floor division keeps the hint
    # an int even where `/` means true division, and `max` gives every tile
    # at least one point when there are fewer points than workers.
    tile_hint = (max(1, n_points // n_workers), )

    # task_array is used for deciding the index of the starting points and
    # the index of the ending points that each tile needs to find the
    # shortest path among.
    task_array = expr.ndarray((n_points,), tile_hint=tile_hint)
    task_array = task_array.force()

    # dist_matrix holds the result; contributions to the same cell are
    # combined by addition.
    dist_matrix = expr.ndarray((n_points, n_points), reduce_fn=lambda a,b:a+b).force()
    # Called for its effect on dist_matrix; the return value is not needed.
    task_array.foreach_tile(mapper_fn = _shortest_path_mapper,
                            kw = {'kng' : kng,
                                  'directed' : False,
                                  'dist_matrix' : dist_matrix})
    self.dist_matrix_ = dist_matrix.glom()
    # Kernel handed to KernelPCA: -0.5 * D ** 2 (element-wise).
    G = self.dist_matrix_ ** 2
    G *= -0.5
    self.embedding_ = self.kernel_pca_.fit_transform(G)
Beispiel #4
0
  def fit(self, X):
    """Remember the training data on the instance.

    Parameters
    ----------
    X : numpy.ndarray (2-D), expr.Expr, or an already evaluated array
        A NumPy array is first turned into an expression tiled row-wise
        across the workers; an expression is then evaluated with ``force()``.
        Anything else is stored unchanged.

    Returns
    -------
    self : object
        Returns self, with the data available as ``self.X``.
    """
    ctx = blob_ctx.get()
    if isinstance(X, np.ndarray):
      # Tile extents must be integers: use floor division (identical to `/`
      # on ints where `/` floors, and still an int under true division) and
      # never ask for zero-row tiles when rows < workers.
      rows_per_tile = max(1, X.shape[0] // ctx.num_workers)
      X = expr.from_numpy(X, tile_hint=(rows_per_tile, X.shape[1]))
    if isinstance(X, expr.Expr):
      X = X.force()

    self.X = X
    return self
Beispiel #5
0
  def test_pca(self):
    """Check that ``PCA`` and ``SK_PCA`` find the same components up to sign."""
    n_workers = blob_ctx.get().num_workers
    # Random input, split row-wise into one block of rows per worker.
    rows_per_tile = int(DIM[0]/n_workers)
    data = expr.randn(*DIM, tile_hint=(rows_per_tile, DIM[1])).force()

    ours = PCA(N_COMPONENTS)
    reference = SK_PCA(N_COMPONENTS)

    ours.fit(data)
    reference.fit(data.glom())

    # Components are compared by magnitude, so a flipped sign is accepted.
    got = absolute(ours.components_)
    want = absolute(reference.components_)
    assert np.allclose(got, want)
  def test_ib_recommender(self):
    """Smoke test for ``ItemBasedRecommender.precompute``.

    Builds a random CSR rating table of shape (N_USERS, N_ITEMS) with density
    0.1, tiled column-wise across the workers, and runs ``precompute()`` on
    it.  There are no assertions: the test passes if nothing raises.
    """
    ctx = blob_ctx.get()

    # Floor division keeps the tile width an integer even where `/` performs
    # true division; on ints it is identical to the old `/` behaviour.
    rating_table = expr.sparse_rand((N_USERS, N_ITEMS),
                                    dtype=np.float64,
                                    density=0.1,
                                    format="csr",
                                    tile_hint=(N_USERS, N_ITEMS // ctx.num_workers))
    model = ItemBasedRecommender(rating_table)
    model.precompute()
Beispiel #7
0
    def fit(self, X, y):
        """Build the per-worker forests from the training set (X, y).

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).

        Returns
        -------
        self : object
            Returns self.
        """
        # Lift plain NumPy inputs into expressions, then evaluate both.
        if isinstance(X, np.ndarray):
            X = expr.from_numpy(X)
        if isinstance(y, np.ndarray):
            y = expr.from_numpy(y)
        X, y = expr.force(X), expr.force(y)

        # Number of distinct target values seen in y.
        self.n_classes = np.unique(y.glom()).size
        n_workers = blob_ctx.get().num_workers

        # One single-element tile per task, and a matching object array for
        # whatever each task produces.
        task_spec = self._create_task_array(n_workers, self.n_estimators)
        task_array = expr.from_numpy(task_spec, tile_hint=(1, )).force()
        target_array = expr.ndarray((task_array.shape[0], ),
                                    dtype=object,
                                    tile_hint=(1, )).force()

        mapper_kw = {
            'task_array': task_array,
            'target_array': target_array,
            'X': X,
            'y': y,
            'criterion': self.criterion,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'max_features': self.max_features,
            'bootstrap': self.bootstrap,
        }
        task_array.foreach_tile(mapper_fn=_build_mapper, kw=mapper_kw)

        # Target array stores the local random forest each worker builds;
        # it is kept for later prediction.
        self.target_array = target_array
        return self
Beispiel #8
0
 def test_ssvd(self):
   """Compare ``svd`` with ``linalg.svd`` on a random matrix, up to sign."""
   n_workers = blob_ctx.get().num_workers
   # Random float64 matrix from randn, tiled in blocks of rows.
   row_block = int(DIM[0]/n_workers)
   A = expr.randn(*DIM, tile_hint=(row_block, DIM[1]), dtype=np.float64)

   U, S, VT = svd(A)
   U2, S2, VT2 = linalg.svd(A.glom(), full_matrices=0)

   # Only magnitudes are compared, so a flipped sign is accepted.
   for got, want in ((U.glom(), U2), (S, S2), (VT, VT2)):
     assert np.allclose(absolute(got), absolute(want))
Beispiel #9
0
    def test_ib_recommender(self):
        """Smoke test for ``ItemBasedRecommender.precompute``.

        Builds a random CSR rating table of shape (N_USERS, N_ITEMS) with
        density 0.1, tiled column-wise across the workers, and runs
        ``precompute()`` on it.  There are no assertions: the test passes if
        nothing raises.
        """
        ctx = blob_ctx.get()

        # Switch automatic tiling off so the explicit tile_hint below is the
        # one in effect (presumed purpose of the flag -- confirm).
        FLAGS.opt_auto_tiling = 0
        # Floor division keeps the tile width an integer even where `/`
        # performs true division; on ints it equals the old `/` behaviour.
        rating_table = expr.sparse_rand(
            (N_USERS, N_ITEMS),
            dtype=np.float64,
            density=0.1,
            format="csr",
            tile_hint=(N_USERS, N_ITEMS // ctx.num_workers))
        model = ItemBasedRecommender(rating_table)
        model.precompute()
    def precompute(self):
        '''Precompute the k most similar items for each item.

        After this function returns, the two attributes below have been
        created (``self.similarity_table`` is also set along the way).

        Attributes
        ----------
        top_k_similar_table : Numpy array of shape (N, k).
            Records the k highest similarity scores for each item.
        top_k_similar_indices : Numpy array of shape (N, k).
            Records the indices of the k most similar items for each item.
        '''
        rating_table = self.rating_table
        N = rating_table.shape[1]

        # Item-to-item similarities, declared to the framework as (N, N).
        self.similarity_table = expr.shuffle(
            rating_table,
            _similarity_mapper,
            kw={
                'item_norm':
                self._get_norm_of_each_item(rating_table),
                'step':
                util.divup(rating_table.shape[1],
                           blob_ctx.get().num_workers)
            },
            shape_hint=(N, N))

        # Integer table handed to the selection mapper (presumably the mapper
        # fills it in -- confirm in _select_most_k_similar_mapper).  The
        # builtin `int` is used because the `np.int` alias no longer exists
        # in current NumPy; both name the same type on older versions.
        top_k_similar_indices = expr.zeros((N, self.k), dtype=int)

        # Find top-k similar items for each item.
        # Store the similarity scores into table top_k_similar_table.
        # Store the indices of top k items into table top_k_similar_indices.
        cost = np.prod(top_k_similar_indices.shape)
        # The cost hint is keyed on the indices table; the meaning of the
        # '00'..'11' entries is defined by the optimizer and not visible here.
        top_k_similar_table = expr.shuffle(self.similarity_table,
                                           _select_most_k_similar_mapper,
                                           kw={
                                               'top_k_similar_indices':
                                               top_k_similar_indices,
                                               'k': self.k
                                           },
                                           shape_hint=(N, self.k),
                                           cost_hint={
                                               hash(top_k_similar_indices): {
                                                   '00': 0,
                                                   '01': cost,
                                                   '10': cost,
                                                   '11': cost
                                               }
                                           })
        self.top_k_similar_table = top_k_similar_table.optimized().glom()
        self.top_k_similar_indices = top_k_similar_indices.optimized().glom()
Beispiel #11
0
    def test_ssvd(self):
        """Check ``svd`` against ``linalg.svd`` on a random matrix, up to sign."""
        worker_count = blob_ctx.get().num_workers
        # Random float64 matrix from randn, one block of rows per worker.
        tile_rows = int(DIM[0] / worker_count)
        A = expr.randn(*DIM, tile_hint=(tile_rows, DIM[1]), dtype=np.float64)

        U, S, VT = svd(A)
        U2, S2, VT2 = linalg.svd(A.glom(), full_matrices=0)

        # Signs are not unique, so only absolute values are compared.
        pairs = ((U.glom(), U2), (S, S2), (VT, VT2))
        for actual, expected in pairs:
            assert np.allclose(absolute(actual), absolute(expected))
Beispiel #12
0
 def test_svds(self):
   """Compare ``svds`` with ``linalg.svds`` at full rank, up to sign."""
   ctx = blob_ctx.get()
   # Create a matrix in sparse (CSR) format with density 1.  The tile height
   # uses floor division so it stays an integer even where `/` performs true
   # division; on ints the result is the same as before.
   A = expr.sparse_rand(DIM, density=1,
                        format="csr",
                        tile_hint = (DIM[0] // ctx.num_workers, DIM[1]),
                        dtype=np.float64)

   RANK = np.linalg.matrix_rank(A.glom())
   U,S,VT = svds(A, RANK)
   U2,S2,VT2 = linalg.svds(A.glom(), RANK)

   # Only magnitudes are compared, so a flipped sign is accepted.
   assert np.allclose(absolute(U), absolute(U2))
   assert np.allclose(absolute(S), absolute(S2))
   assert np.allclose(absolute(VT), absolute(VT2))
Beispiel #13
0
  def fit(self, X, y):
    """
    Train on (X, y): every task tile builds its share of the forest.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (integers that correspond to classes in
        classification, real numbers in regression).

    Returns
    -------
    self : object
        Returns self.
    """
    # NumPy inputs become expressions first; both are then evaluated.
    if isinstance(X, np.ndarray):
      X = expr.from_numpy(X)
    if isinstance(y, np.ndarray):
      y = expr.from_numpy(y)
    X, y = X.evaluate(), y.evaluate()

    # Number of distinct target values seen in y.
    self.n_classes = np.unique(y.glom()).size
    worker_count = blob_ctx.get().num_workers

    # One single-element tile per task, plus an object array of the same
    # length for the per-task output.
    task_spec = self._create_task_array(worker_count, self.n_estimators)
    task_array = expr.from_numpy(task_spec, tile_hint=(1, )).evaluate()
    target_shape = (task_array.shape[0], )
    target_array = expr.ndarray(target_shape, dtype=object, tile_hint=(1,)).evaluate()

    mapper_kw = dict(task_array=task_array,
                     target_array=target_array,
                     X=X,
                     y=y,
                     criterion=self.criterion,
                     max_depth=self.max_depth,
                     min_samples_split=self.min_samples_split,
                     min_samples_leaf=self.min_samples_leaf,
                     max_features=self.max_features,
                     bootstrap=self.bootstrap)
    task_array.foreach_tile(mapper_fn=_build_mapper, kw=mapper_kw)

    # Target array stores the local random forest each worker builds;
    # it is kept for later prediction.
    self.target_array = target_array
    return self
Beispiel #14
0
 def test1(self):
   """A disk-checkpointed result must stay usable after worker 0 is marked failed."""
   ones = [expr.ones(ARRAY_SIZE) for _ in range(3)]
   summed = ones[0] + ones[1] + ones[2]
   doubled = summed + summed
   # Checkpoint the next doubling to disk and evaluate it.
   z = expr.checkpoint(doubled + doubled, mode='disk')
   z.force()

   # Mark worker 0 as failed before the checkpointed value is used again.
   blob_ctx.get().local_worker.mark_failed_worker(0)

   # One more doubling; the expected value is 24 everywhere.
   total = z + z
   Assert.all_eq(total.glom(), np.ones(ARRAY_SIZE)*24)
Beispiel #15
0
  def test1(self):
    """Use a disk checkpoint again after worker 0 has been marked failed."""
    first, second, third = (expr.ones(ARRAY_SIZE) for _ in range(3))
    base_sum = first + second + third
    twice = base_sum + base_sum
    # The next doubling is checkpointed to disk and evaluated.
    z = expr.checkpoint(twice + twice, mode='disk')
    z.evaluate()

    # Mark worker 0 as failed, then build on the checkpointed value.
    worker = blob_ctx.get().local_worker
    worker.mark_failed_worker(0)

    # After the last doubling every element is expected to be 24.
    result = z + z
    Assert.all_eq(result.glom(), np.ones(ARRAY_SIZE)*24)
Beispiel #16
0
    def test_svds(self):
        """Compare ``svds`` with ``linalg.svds`` at full rank, up to sign."""
        ctx = blob_ctx.get()
        # Create a matrix in sparse (CSR) format with density 1.  The tile
        # height uses floor division so it stays an integer even where `/`
        # performs true division; on ints the result is the same as before.
        A = expr.sparse_rand(DIM,
                             density=1,
                             format="csr",
                             tile_hint=(DIM[0] // ctx.num_workers, DIM[1]),
                             dtype=np.float64)

        RANK = np.linalg.matrix_rank(A.glom())
        U, S, VT = svds(A, RANK)
        U2, S2, VT2 = linalg.svds(A.glom(), RANK)

        # Only magnitudes are compared, so a flipped sign is accepted.
        assert np.allclose(absolute(U), absolute(U2))
        assert np.allclose(absolute(S), absolute(S2))
        assert np.allclose(absolute(VT), absolute(VT2))
Beispiel #17
0
def svd(A, k=None):
  """
  Stochastic SVD.

  Parameters
  ----------
  A : spartan matrix
      Array to compute the SVD on, of shape (M, N)
  k : int, optional
      Number of singular values and vectors to compute.  Defaults to N.

  The operations include matrix multiplication and QR decomposition.
  We parallelize both of them.

  Returns
  --------
  U : Spartan array of shape (M, k)
  S : numpy array of shape (k,), sorted from large to small
  VT : numpy array of shape (k, N)
      The transpose of V (it is ``V.T`` that is returned); this is (k, k)
      only when k == N, which is the default.
  """
  if k is None:
    k = A.shape[1]

  ctx = blob_ctx.get()
  # Tile extents must be integers: floor division gives the same value as
  # `/` on ints where `/` floors, and stays an int under true division.
  Omega = expr.randn(A.shape[1], k, tile_hint=(A.shape[1] // ctx.num_workers, k))

  r = A.shape[0] // ctx.num_workers
  # Project A onto k random directions.
  Y = expr.dot(A, Omega, tile_hint=(r, k)).force()

  # Only the Q factor of the QR decomposition is used.
  Q, _ = qr(Y)

  # B = Q^T A is small enough that B B^T is decomposed locally.
  B = expr.dot(expr.transpose(Q), A)
  BTB = expr.dot(B, expr.transpose(B)).glom()

  # Eigenvalues of B B^T are the squared singular values.
  # NOTE(review): eig is used on a matrix that is symmetric by construction;
  # eigh may be the better fit -- left unchanged to preserve results.
  S, U_ = np.linalg.eig(BTB)
  S = np.sqrt(S)

  # Sort by eigen values from large to small
  si = np.argsort(S)[::-1]
  S = S[si]
  U_ = U_[:, si]

  U = expr.dot(Q, U_).force()
  # V = B^T U_ diag(1 / S)
  V = np.dot(np.dot(expr.transpose(B).glom(), U_), np.diag(np.ones(S.shape[0]) / S))
  return U, S, V.T
Beispiel #18
0
  def _get_norm_of_each_item(self, rating_table):
    """Get the norm of each item vector.

    For each item, calculate the Euclidean norm of its rating column, i.e.
    the square root of the column-wise sum of squared ratings.

    Parameters
    ----------
    rating_table : Spartan matrix of shape (M, N).
                   Each column represents the ratings of one item.

    Returns
    ---------
    item_norm : Spartan matrix of shape (N,).
                item_norm[i] equals || rating_table[:,i] ||

    """
    ctx = blob_ctx.get()
    # An evaluated DistArray is wrapped so it can be used in expressions.
    if isinstance(rating_table, array.distarray.DistArray):
      rating_table = expr.lazify(rating_table)
    # Tile extents must be integers: floor division gives the same value as
    # `/` on ints where `/` floors, and stays an int under true division.
    tile_len = rating_table.shape[1] // ctx.num_workers
    squared = expr.multiply(rating_table, rating_table)
    res = expr.sqrt(expr.sum(squared, axis=0, tile_hint=(tile_len, )))
    return res.force()
Beispiel #19
0
  def precompute(self):
    '''Precompute the k most similar items for each item.

    After this function returns, the two attributes below have been created
    (``self.similarity_table`` is also set along the way).

    Attributes
    ----------
    top_k_similar_table : Numpy array of shape (N, k).
                          Records the k highest similarity scores for each item.
    top_k_similar_indices : Numpy array of shape (N, k).
                            Records the indices of the k most similar items for each item.
    '''
    rating_table = self.rating_table
    N = rating_table.shape[1]

    # Item-to-item similarities, declared to the framework as (N, N).
    self.similarity_table = expr.shuffle(rating_table, _similarity_mapper,
                                         kw={'item_norm': self._get_norm_of_each_item(rating_table),
                                             'step': util.divup(rating_table.shape[1], blob_ctx.get().num_workers)},
                                         shape_hint=(N, N))

    # Integer table handed to the selection mapper (presumably the mapper
    # fills it in -- confirm in _select_most_k_similar_mapper).  The builtin
    # `int` is used because the `np.int` alias no longer exists in current
    # NumPy; both name the same type on older versions.
    top_k_similar_indices = expr.zeros((N, self.k), dtype=int)

    # Find top-k similar items for each item.
    # Store the similarity scores into table top_k_similar_table.
    # Store the indices of top k items into table top_k_similar_indices.
    cost = np.prod(top_k_similar_indices.shape)
    # The cost hint is keyed on the indices table; the meaning of the
    # '00'..'11' entries is defined by the optimizer and not visible here.
    top_k_similar_table = expr.shuffle(self.similarity_table, _select_most_k_similar_mapper,
                                       kw = {'top_k_similar_indices': top_k_similar_indices, 'k': self.k},
                                       shape_hint=(N, self.k),
                                       cost_hint={hash(top_k_similar_indices):{'00': 0, '01': cost, '10': cost, '11': cost}})
    self.top_k_similar_table = top_k_similar_table.optimized().glom()
    self.top_k_similar_indices = top_k_similar_indices.optimized().glom()
Beispiel #20
0
 def test_jacobi(self):
   """Smoke test: set up a Jacobi problem scaled by the worker count and run it.

   There are no assertions; the test passes if evaluation does not raise.
   """
   # `base` is a module-level value; it is only read here.
   problem_size = base * blob_ctx.get().num_workers
   A, b = jacobi.jacobi_init(problem_size)
   # Third argument is 10 (presumably the iteration count -- confirm).
   solution = jacobi.jacobi_method(A, b, 10)
   solution.glom()
Beispiel #21
0
def solve(A, AT, desired_rank, is_symmetric=False):
  '''
  A simple implementation of the Lanczos algorithm
  (http://en.wikipedia.org/wiki/Lanczos_algorithm) for eigenvalue computation.

  Like the Mahout implementation, only the matrix*vector step is parallelized.

  First the Lanczos iteration turns the operator into tridiagonal form.  Then
  numpy.linalg.eig extracts the eigenvalues and eigenvectors of that
  tridiagonal matrix, whose size is (desired_rank + 2) squared.  Since
  desired_rank should be much smaller than the size of the matrix, this last
  step can be done efficiently on the local machine.

  The operator being decomposed is A itself when is_symmetric is true, and
  v -> AT * (A * v) otherwise.

  Parameters
  ----------
  A : spartan array or expression; n = A.shape[1] is the vector length used.
  AT : spartan array or expression applied after A in the non-symmetric case
       (expected to be the transpose of A).
  desired_rank : int
      Number of eigenpairs to return.
  is_symmetric : bool, optional
      If true, only A is applied in each iteration.

  Returns
  -------
  d : numpy array with the desired_rank eigenvalues of largest absolute
      value, ordered by decreasing absolute value.
  s : numpy array of shape (n, desired_rank) whose columns are the matching
      vectors, i.e. the Lanczos basis times the tridiagonal eigenvectors.
  '''
  A = expr.force(A)
  AT = expr.force(AT)
  ctx = blob_ctx.get()
  # Calculate two more eigenvalues, but we only keep the largest desired_rank
  # ones. Doing this keeps the result consistent with scipy.sparse.linalg.svds.
  desired_rank += 2

  n = A.shape[1]
  v_next = np.ones(n) / np.sqrt(n)
  v_prev = np.zeros(n)
  beta = np.zeros(desired_rank+1)
  beta[0] = 0
  alpha = np.zeros(desired_rank)

  # Since desired_rank << size of the matrix, V is kept in local memory for
  # efficiency (it needs to be updated in every iteration).
  # If V ever does not fit in local memory, it could be turned into a
  # spartan distributed array.
  V = np.zeros((n, desired_rank))

  # Tile extents must be integers: floor division gives the same value as `/`
  # on ints where `/` floors, and stays an int under true division.
  rows_per_tile = n // ctx.num_workers

  for i in range(0, desired_rank):
    util.log_info("Iter : %s", i)
    v_next_expr = expr.from_numpy(v_next.reshape(n, 1), tile_hint=(rows_per_tile, 1))

    # The only distributed step: apply the operator to the current vector.
    if is_symmetric:
      w = expr.dot(A, v_next_expr).glom().reshape(n)
    else:
      w = expr.dot(A, v_next_expr, tile_hint=(min(*A.tile_shape()), 1)).force()
      w = expr.dot(AT, w, tile_hint=(min(*A.tile_shape()), 1)).glom().reshape(n)

    alpha[i] = np.dot(w, v_next)
    w = w - alpha[i] * v_next - beta[i] * v_prev

    # Orthogonalize against all previously stored basis vectors:
    for t in range(i):
      tmpa = np.dot(w, V[:, t])
      if tmpa == 0.0:
        continue
      w -= tmpa * V[:, t]

    beta[i+1] = np.linalg.norm(w, 2)
    v_prev = v_next
    # NOTE(review): a zero beta[i+1] is not handled; the division below would
    # then produce inf/nan entries.
    v_next = w / beta[i+1]
    V[:, i] = v_prev

  # Create tridiag matrix with size (desired_rank X desired_rank)
  tridiag = np.diag(alpha)
  for i in range(0, desired_rank-1):
    tridiag[i, i+1] = beta[i+1]
    tridiag[i+1, i] = beta[i+1]

  # Get eigenvectors and eigenvalues of this tridiagonal matrix.
  # Its eigenvalues approximate those of the operator applied in the loop
  # (A, or AT * A in the non-symmetric case).  The operator's eigenvectors
  # are obtained by multiplying V with the eigenvectors of the tridiagonal
  # matrix.
  d, v = np.linalg.eig(tridiag)

  # Sort eigenvalues and their corresponding eigenvectors by decreasing
  # absolute value.
  sorted_idx = np.argsort(np.absolute(d))[::-1]
  d = d[sorted_idx]
  v = v[:, sorted_idx]

  # Map the tridiagonal eigenvectors back through the Lanczos basis.
  s = np.dot(V, v)
  # Drop the two extra eigenpairs that were added above.
  return d[0:desired_rank-2], s[:, 0:desired_rank-2]