Example #1
  def train_smo_1998(self, data, labels):
    '''
    Train an SVM model using the SMO (1998) algorithm.
   
    Args:
      data(Expr): points to be trained
      labels(Expr): the correct labels of the training data
    '''
    
    N = data.shape[0] # Number of instances
    D = data.shape[1]  # Number of features

    self.b = 0.0
    self.alpha = expr.zeros((N,1), dtype=np.float64, tile_hint=[N/self.ctx.num_workers, 1]).force()
    
    # linear kernel
    kernel_results = expr.dot(data, expr.transpose(data), tile_hint=[N/self.ctx.num_workers, N])   
    
    labels = expr.force(labels)
    self.E = expr.zeros((N,1), dtype=np.float64, tile_hint=[N/self.ctx.num_workers, 1]).force()
    for i in xrange(N):
      self.E[i, 0] = self.b + expr.reduce(self.alpha, axis=None, dtype_fn=lambda input: input.dtype,
                                          local_reduce_fn=margin_mapper,
                                          accumulate_fn=np.add, 
                                          fn_kw=dict(label=labels, data=kernel_results[:,i].force())).glom() - labels[i, 0]
    
    util.log_info("Starting SMO")
    it = 0
    num_changed = 0
    examine_all = True
    while (num_changed > 0 or examine_all) and (it < self.maxiter):
      util.log_info("Iteration:%d", it)

      num_changed = 0
      
      if examine_all:
        for i in xrange(N): 
          num_changed += self.examine_example(i, N, labels, kernel_results)
      else:
        for i in xrange(N):
          if self.alpha[i, 0] > 0 and self.alpha[i, 0] < self.C:
            num_changed += self.examine_example(i, N, labels, kernel_results)

      it += 1

      if examine_all: examine_all = False
      elif num_changed == 0: examine_all = True
    
    self.w = expr.zeros((D, 1), dtype=np.float64).force()
    for i in xrange(D): 
      self.w[i,0] = expr.reduce(self.alpha, axis=None, dtype_fn=lambda input: input.dtype,
                              local_reduce_fn=margin_mapper,
                              accumulate_fn=np.add, 
                              fn_kw=dict(label=labels, data=expr.force(data[:,i]))).glom()
    self.usew_ = True
    print 'iterations finished:', it
    print 'b:', self.b
    print 'w:', self.w.glom()
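The final loop above materializes the primal weights from the dual solution; this is the standard linear-SVM identity w = sum_i alpha_i * y_i * x_i, which the margin_mapper reduction distributes one feature at a time. A minimal local NumPy sketch of the same recovery, with toy alpha, labels, and data standing in for the distributed arrays:

import numpy as np

# Toy stand-ins (assumed shapes: data is N x D, labels and alpha are N x 1,
# labels in {-1, +1}, alpha as if taken from a finished SMO run).
rng = np.random.RandomState(0)
N, D = 6, 3
data = rng.rand(N, D)
labels = np.sign(rng.rand(N, 1) - 0.5)
alpha = rng.rand(N, 1)

# w[i] = sum_j alpha_j * y_j * data[j, i]: the per-feature reduction above.
w = (alpha * labels * data).sum(axis=0).reshape(D, 1)

# A point's margin is then f(x) = w.T x + b.
b = 0.0
print(np.dot(data, w) + b)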
Example #2
def random_galaxy(n):
    '''Generate a galaxy of random bodies.'''
    dtype = np.float  # consistent with sp.rand, same as np.float64

    galaxy = {  # All bodies stand still initially.
        'm': (rand(n) + dtype(10)) * dtype(m_sol / 10),
        'x': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
        'y': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
        'z': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
        'vx': zeros((n, )),
        'vy': zeros((n, )),
        'vz': zeros((n, ))
    }
    return galaxy
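The helper relies on module-level names (rand, zeros, m_sol, r_ly) that the snippet does not show. A self-contained NumPy version with assumed values for those constants (hypothetical here), plus a small usage check:

import numpy as np
from numpy import zeros
from numpy.random import rand

# Assumed values for the constants the snippet relies on:
m_sol = 1.989e30  # solar mass [kg]
r_ly = 9.461e15   # one light-year [m]

def random_galaxy(n):
    '''Generate a galaxy of random bodies; all bodies stand still initially.'''
    dtype = np.float64
    return {
        'm': (rand(n) + dtype(10)) * dtype(m_sol / 10),
        'x': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
        'y': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
        'z': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
        'vx': zeros((n,)),
        'vy': zeros((n,)),
        'vz': zeros((n,)),
    }

galaxy = random_galaxy(1000)
print(galaxy['m'].shape)   # (1000,)
print(galaxy['vx'].max())  # 0.0: every body starts at rest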
Example #3
def random_galaxy(n):
  '''Generate a galaxy of random bodies.'''
  dtype = np.float  # consistent with sp.rand, same as np.float64

  galaxy = {  # All bodies stand still initially.
      'm': (rand(n) + dtype(10)) * dtype(m_sol/10),
      'x': (rand(n) - dtype(0.5)) * dtype(r_ly/100),
      'y': (rand(n) - dtype(0.5)) * dtype(r_ly/100),
      'z': (rand(n) - dtype(0.5)) * dtype(r_ly/100),
      'vx': zeros((n, )),
      'vy': zeros((n, )),
      'vz': zeros((n, ))
      }
  return galaxy
Example #4
  def precompute(self):
    '''Precompute the k most similar items for each item.

    After this function returns, two attributes will be created.

    Attributes
    ----------
    top_k_similar_table : NumPy array of shape (N, k).
                          Records the similarity scores of the k most similar items for each item.
    top_k_similar_indices : NumPy array of shape (N, k).
                            Records the indices of the k most similar items for each item.
    '''
    M = self.rating_table.shape[0]
    N = self.rating_table.shape[1]
    self.rating_table = expr.force(self.rating_table)
    
    assert self.rating_table.tile_shape()[0] == M, \
           "rating table is only allowed to tile by columns!"

    self.similarity_table = expr.zeros(shape=(N, N), 
                                       tile_hint=(self.rating_table.tile_shape()[1], N)).force() 

    self.item_norm = self._get_norm_of_each_item(self.rating_table) 

    self.rating_table.foreach_tile(mapper_fn=_similarity_mapper,
                                   kw={'rating_table' : self.rating_table,
                                       'similarity_table' : self.similarity_table,
                                       'item_norm' : self.item_norm,
                                       'step' : self.rating_table.tile_shape()[1]})

    # Release the memory for item_norm
    self.item_norm = None
    k = self.k
    top_k_similar_table = expr.zeros((N, k), 
                                      tile_hint=(self.rating_table.tile_shape()[1], k)).force()

    top_k_similar_indices = expr.zeros((N, k), 
                                        tile_hint=(self.rating_table.tile_shape()[1], k), 
                                                   dtype=np.int).force()
    
    # Find top-k similar items for each item.
    # Store the similarity scores into top_k_similar_table.
    # Store the indices of the top k items into top_k_similar_indices.
    self.similarity_table.foreach_tile(mapper_fn=_select_most_k_similar_mapper,
                                       kw={'similarity_table' : self.similarity_table,
                                           'top_k_similar_table' : top_k_similar_table,
                                           'top_k_similar_indices' : top_k_similar_indices,
                                           'k' : k})
    self.top_k_similar_table = top_k_similar_table.glom()
    self.top_k_similar_indices = top_k_similar_indices.glom()
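For intuition, the two mappers build an item-item similarity matrix and then keep the k best entries per row. A dense local NumPy sketch of the same pipeline, assuming the similarity is cosine similarity between item columns (the snippet does not show _similarity_mapper's exact formula):

import numpy as np

rng = np.random.RandomState(0)
M, N, k = 8, 5, 2
rating_table = rng.rand(M, N)  # M users x N items, dense toy stand-in

# Cosine similarity between item columns.
item_norm = np.linalg.norm(rating_table, axis=0)
similarity = np.dot(rating_table.T, rating_table) / np.outer(item_norm, item_norm)

# Keep the k most similar items per item: scores and indices, shape (N, k).
top_k_similar_indices = np.argsort(-similarity, axis=1)[:, :k]
top_k_similar_table = -np.sort(-similarity, axis=1)[:, :k]
print(top_k_similar_indices.shape)  # (5, 2)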
Example #5
def benchmark_optimization(ctx, timer):
  FLAGS.optimization = 0
  DATA_SIZE = 5 * 1000 * 1000
  current = eager(zeros((DATA_SIZE * ctx.num_workers,),
                        dtype=np.float32, tile_hint = (DATA_SIZE,)))
  strike = eager(ones((DATA_SIZE * ctx.num_workers,),
                      dtype=np.float32, tile_hint=(DATA_SIZE,)))
  maturity = eager(strike * 12)
  rate = eager(strike * 0.05)
  volatility = eager(strike * 0.01)

  timer.time_op('opt-none', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-none', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-none', lambda: bs_step(current, strike, maturity, rate, volatility))

  FLAGS.optimization = 1
  FLAGS.opt_parakeet_gen = 0
  FLAGS.opt_map_fusion = 1
  timer.time_op('opt-fusion', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-fusion', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-fusion', lambda: bs_step(current, strike, maturity, rate, volatility))

  FLAGS.opt_parakeet_gen = 1
  timer.time_op('opt-parakeet', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-parakeet', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-parakeet', lambda: bs_step(current, strike, maturity, rate, volatility))
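bs_step itself is not shown. For context, a plausible elementwise Black-Scholes call-price kernel over the same five arrays might look like the following NumPy sketch; this is an assumption about bs_step's contents, not the benchmark's actual kernel, and the benchmark's all-zero current array is replaced with positive prices so the logarithm is defined:

import numpy as np
from scipy.special import erf

def norm_cdf(x):
  # Standard normal CDF via the error function.
  return 0.5 * (1.0 + erf(x / np.sqrt(2.0)))

def bs_step(current, strike, maturity, rate, volatility):
  '''Elementwise Black-Scholes call price (hypothetical stand-in).'''
  d1 = (np.log(current / strike) +
        (rate + 0.5 * volatility ** 2) * maturity) / (volatility * np.sqrt(maturity))
  d2 = d1 - volatility * np.sqrt(maturity)
  return current * norm_cdf(d1) - strike * np.exp(-rate * maturity) * norm_cdf(d2)

price = bs_step(np.full(4, 100.0), np.full(4, 95.0),
                np.full(4, 1.0), np.full(4, 0.05), np.full(4, 0.2))
print(price)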
Example #6
def cgit(A, x):
  '''
  CGIT: Conjugate Gradient iteration.
  z = cgit(A, x) generates an approximate solution to A*z = x.
  
  Args:
  A(Expr): matrix to be processed.
  x(Expr): the input vector.
  '''
  z = expr.zeros(x.shape, tile_hint=(A.tile_shape()[1], 1))
  r = x
  rho = expr.sum(r * r).glom()
  #util.log_warn('rho:%s', rho)
  p = r
  
  for i in xrange(15):
    q = expr.dot(A, p, tile_hint=(A.tile_shape()[1], 1))
    alpha = rho / expr.sum(p * q).glom()
    #util.log_warn('alpha:%s', alpha)
    z = z + p * alpha
    rho0 = rho
    r = r - q * alpha
    rho = expr.sum(r * r).glom()
    beta = rho / rho0
    #util.log_warn('beta:%s', beta)
    p = r + p * beta
  
  return z
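The loop is a fixed 15 iterations of unpreconditioned conjugate gradient. A line-for-line NumPy transcription, checked against a small symmetric positive-definite system:

import numpy as np

def cgit_numpy(A, x, iters=15):
  '''Plain NumPy version of the loop above: approximate solution of A z = x.'''
  z = np.zeros_like(x)
  r = x.copy()
  rho = np.sum(r * r)
  p = r.copy()
  for _ in range(iters):
    q = np.dot(A, p)
    alpha = rho / np.sum(p * q)
    z = z + p * alpha
    rho0 = rho
    r = r - q * alpha
    rho = np.sum(r * r)
    p = r + p * (rho / rho0)
  return z

rng = np.random.RandomState(0)
B = rng.rand(10, 10)
A = np.dot(B, B.T) + 10 * np.eye(10)  # symmetric positive definite
x = rng.rand(10, 1)
print(np.allclose(np.dot(A, cgit_numpy(A, x)), x, atol=1e-6))  # True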
Example #7
def jacobi_method(A, b, _iter=100):
    """
  Iterative algorithm for approximating the solutions of a diagonally dominant system of linear equations. 

  Parameters
  ----------
  A : ndarray or Expr - 2d
      Input matrix
  b : ndarray or Expr - vector
      RHS vector
  _iter : int
      Times of iteration needed, default to be 100

 Returns
  -------
  result : Expr - vector
      Approximated solution.
  """
    util.Assert.eq(A.shape[0], b.shape[0])

    x = expr.zeros((A.shape[0],))

    D = expr.diag(A)
    R = A - expr.diagflat(D)

    for i in xrange(_iter):
        x = (b - expr.dot(R, x)) / D

    return x
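The update x <- (b - R x) / D is the textbook Jacobi sweep. A NumPy transcription with a small diagonally dominant system as a sanity check:

import numpy as np

def jacobi_numpy(A, b, _iter=100):
    '''NumPy version of the update above: x <- (b - R x) / D.'''
    x = np.zeros(A.shape[0])
    D = np.diag(A)
    R = A - np.diagflat(D)
    for i in range(_iter):
        x = (b - np.dot(R, x)) / D
    return x

A = np.array([[4.0, 1.0], [2.0, 5.0]])  # diagonally dominant, so Jacobi converges
b = np.array([1.0, 2.0])
print(jacobi_numpy(A, b))      # ~[0.1667, 0.3333]
print(np.linalg.solve(A, b))   # reference solution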
Example #8
def jacobi_method(A, b, _iter=100):
    """
  Iterative algorithm for approximating the solutions of a diagonally dominant system of linear equations. 

  Parameters
  ----------
  A : ndarray or Expr - 2d
      Input matrix
  b : ndarray or Expr - vector
      RHS vector
  _iter : int
      Times of iteration needed, default to be 100

 Returns
  -------
  result : Expr - vector
      Approximated solution.
  """
    util.Assert.eq(A.shape[0], b.shape[0])

    x = expr.zeros((A.shape[0], ))

    D = expr.diag(A)
    R = A - expr.diagflat(D)

    for i in xrange(_iter):
        x = (b - expr.dot(R, x)) / D

    return x
Example #9
def cgit(A, x):
    '''
  CGIT: Conjugate Gradient iteration.
  z = cgit(A, x) generates an approximate solution to A*z = x.
  
  Args:
  A(Expr): matrix to be processed.
  x(Expr): the input vector.
  '''
    z = expr.zeros(x.shape)
    r = x
    rho = expr.sum(r * r).optimized().glom()
    #util.log_warn('rho:%s', rho)
    p = r

    for i in xrange(15):
        q = expr.dot(A, p)
        alpha = rho / expr.sum(p * q).optimized().glom()
        #util.log_warn('alpha:%s', alpha)
        z = z + p * alpha
        rho0 = rho
        r = r - q * alpha
        rho = expr.sum(r * r).optimized().glom()
        beta = rho / rho0
        #util.log_warn('beta:%s', beta)
        p = r + p * beta

    return z
Example #10
  def precompute(self):
    '''Precompute the k most similar items for each item.

    After this function returns, two attributes will be created.

    Attributes
    ----------
    top_k_similar_table : NumPy array of shape (N, k).
                          Records the similarity scores of the k most similar items for each item.
    top_k_similar_indices : NumPy array of shape (N, k).
                            Records the indices of the k most similar items for each item.
    '''
    M = self.rating_table.shape[0]
    N = self.rating_table.shape[1]

    self.similarity_table = expr.shuffle(self.rating_table, _similarity_mapper, 
                                         kw={'item_norm': self._get_norm_of_each_item(self.rating_table), 
                                             'step': util.divup(self.rating_table.shape[1], blob_ctx.get().num_workers)}, 
                                         shape_hint=(N, N))

    top_k_similar_indices = expr.zeros((N, self.k), dtype=np.int)
    
    # Find top-k similar items for each item.
    # Store the similarity scores into top_k_similar_table.
    # Store the indices of the top k items into top_k_similar_indices.
    cost = np.prod(top_k_similar_indices.shape)
    top_k_similar_table = expr.shuffle(self.similarity_table, _select_most_k_similar_mapper, 
                                       kw = {'top_k_similar_indices': top_k_similar_indices, 'k': self.k}, 
                                       shape_hint=(N, self.k), 
                                       cost_hint={hash(top_k_similar_indices):{'00': 0, '01': cost, '10': cost, '11': cost}})
    self.top_k_similar_table = top_k_similar_table.optimized().glom()
    self.top_k_similar_indices = top_k_similar_indices.optimized().glom()
Example #11
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
      centers = expr.rand(k, num_dim)

  labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = expr.as_array(centers)
    points_broadcast = expr.reshape(points, (points.shape[0], 1, points.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(points_broadcast - centers_broadcast), axis=2)
    # This is used to avoid dividing by zero
    distances = distances + 0.00000000001
    util.log_info('distances shape %s' % str(distances.shape))
    distances_broadcast = expr.reshape(distances, (distances.shape[0], 1,
                                                   distances.shape[1]))
    distances_broadcast2 = expr.reshape(distances, (distances.shape[0],
                                                    distances.shape[1], 1))
    prob = 1.0 / expr.sum(expr.power(distances_broadcast / distances_broadcast2,
                                     2.0 / (m - 1)), axis=2)
    prob.force()
    counts = expr.sum(prob, axis=0)
    counts = expr.reshape(counts, (counts.shape[0], 1))
    labels = expr.argmax(prob, axis=1)
    centers = expr.sum(expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
                       expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
                       axis=0)

    # We assume that centers is small enough to be handled on the master.
    counts = counts.glom()
    centers = centers.glom()
    # If any centroids don't have any points assigned to them.
    zcount_indices = (counts == 0).reshape(k)

    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which results in their
      # position being the zero-vector.  We reseed these centroids with new random values
      # and set their counts to 1 in order to get rid of dividing by zero.
      counts[zcount_indices, :] = 1
      centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                  num_dim)

    centers = centers / counts
  return labels
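The membership computation above normalizes powered distance ratios so that each point's memberships sum to one. Isolated in plain NumPy with the same broadcasting and the same epsilon:

import numpy as np

rng = np.random.RandomState(0)
points = rng.rand(20, 2)
centers = rng.rand(3, 2)
m = 2.0

# Squared distances point-to-center, with the same small epsilon added
# to avoid dividing by zero.
d = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2) + 1e-11

# prob[i, j] = 1 / sum_l (d[i, l] / d[i, j]) ** (2 / (m - 1)), as above.
prob = 1.0 / ((d[:, None, :] / d[:, :, None]) ** (2.0 / (m - 1))).sum(axis=2)
print(np.allclose(prob.sum(axis=1), 1.0))  # True: memberships sum to one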
Example #12
def benchmark_optimization(ctx, timer):
    FLAGS.optimization = 0
    DATA_SIZE = 5 * 1000 * 1000
    current = eager(
        zeros((DATA_SIZE * ctx.num_workers, ),
              dtype=np.float32,
              tile_hint=(DATA_SIZE, )))
    strike = eager(
        ones((DATA_SIZE * ctx.num_workers, ),
             dtype=np.float32,
             tile_hint=(DATA_SIZE, )))
    maturity = eager(strike * 12)
    rate = eager(strike * 0.05)
    volatility = eager(strike * 0.01)

    timer.time_op('opt-none',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
    timer.time_op('opt-none',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
    timer.time_op('opt-none',
                  lambda: bs_step(current, strike, maturity, rate, volatility))

    FLAGS.optimization = 1
    FLAGS.opt_parakeet_gen = 0
    FLAGS.opt_map_fusion = 1
    timer.time_op('opt-fusion',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
    timer.time_op('opt-fusion',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
    timer.time_op('opt-fusion',
                  lambda: bs_step(current, strike, maturity, rate, volatility))

    FLAGS.opt_parakeet_gen = 1
    timer.time_op('opt-parakeet',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
    timer.time_op('opt-parakeet',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
    timer.time_op('opt-parakeet',
                  lambda: bs_step(current, strike, maturity, rate, volatility))

    FLAGS.opt_parakeet_gen = 0
    FLAGS.opt_auto_tiling = 0
    timer.time_op('opt-tiling = 0',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
    timer.time_op('opt-tiling = 0',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
    timer.time_op('opt-tiling = 0',
                  lambda: bs_step(current, strike, maturity, rate, volatility))

    FLAGS.opt_auto_tiling = 1
    timer.time_op('opt-tiling',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
    timer.time_op('opt-tiling',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
    timer.time_op('opt-tiling',
                  lambda: bs_step(current, strike, maturity, rate, volatility))
Example #13
def benchmark_slice(ctx, timer):
  TEST_SIZE = 1000 * ctx.num_workers
  
  # Force the zeros array to evaluate first.
  x = expr.eager(expr.zeros((TEST_SIZE,10000)))

  for i in range(5): 
    timer.time_op('slice-rows', lambda: expr.evaluate(x[200:300, :].sum()))
    timer.time_op('slice-cols', lambda: expr.evaluate(x[:, 200:300].sum()))
    timer.time_op('slice-box', lambda: expr.evaluate(x[200:300, 200:300].sum()))
Example #14
def benchmark_slice(ctx, timer):
    TEST_SIZE = 1000 * ctx.num_workers

    # Force the zeros array to evaluate first.
    x = expr.eager(expr.zeros((TEST_SIZE, 10000)))

    for i in range(5):
        timer.time_op('slice-rows', lambda: expr.evaluate(x[200:300, :].sum()))
        timer.time_op('slice-cols', lambda: expr.evaluate(x[:, 200:300].sum()))
        timer.time_op('slice-box',
                      lambda: expr.evaluate(x[200:300, 200:300].sum()))
Example #15
def fit(data, labels, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.
 
  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = expr.zeros((data.shape[1], 1), dtype=np.float64)
  alpha = expr.zeros((data.shape[0], 1), dtype=np.float64)
  for i in range(T):
    alpha = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                         _svm_mapper, 
                         kw={'labels': labels, 'alpha': alpha, 'w': w, 'lambda_n': la * data.shape[0]},
                         shape_hint=alpha.shape, 
                         cost_hint={ hash(labels) : {'00': 0, '01': np.prod(labels.shape)}, hash(alpha) : {'00': 0, '01': np.prod(alpha.shape)} })
    w = expr.sum(data * alpha * 1.0 / la / data.shape[0], axis=0).reshape((data.shape[1], 1))
    w = w.optimized()
  return w
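After each distributed dual update, the primal weights are rebuilt as w = X^T alpha / (lambda n). That reduction in isolation, as a NumPy identity check with toy stand-ins for data and the dual variables:

import numpy as np

rng = np.random.RandomState(0)
n, d, la = 10, 3, 1.0
X = rng.rand(n, d)        # toy stand-in for data
alpha = rng.rand(n, 1)    # toy stand-in for the dual variables

w = (X * alpha * 1.0 / la / n).sum(axis=0).reshape(d, 1)   # the line above
print(np.allclose(w, np.dot(X.T, alpha) / (la * n)))       # True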
Example #16
    def precompute(self):
        '''Precompute the k most similar items for each item.

    After this function returns, two attributes will be created.

    Attributes
    ----------
    top_k_similar_table : NumPy array of shape (N, k).
                          Records the similarity scores of the k most similar items for each item.
    top_k_similar_indices : NumPy array of shape (N, k).
                            Records the indices of the k most similar items for each item.
    '''
        M = self.rating_table.shape[0]
        N = self.rating_table.shape[1]

        self.similarity_table = expr.shuffle(
            self.rating_table,
            _similarity_mapper,
            kw={
                'item_norm':
                self._get_norm_of_each_item(self.rating_table),
                'step':
                util.divup(self.rating_table.shape[1],
                           blob_ctx.get().num_workers)
            },
            shape_hint=(N, N))

        top_k_similar_indices = expr.zeros((N, self.k), dtype=np.int)

        # Find top-k similar items for each item.
        # Store the similarity scores into top_k_similar_table.
        # Store the indices of the top k items into top_k_similar_indices.
        cost = np.prod(top_k_similar_indices.shape)
        top_k_similar_table = expr.shuffle(self.similarity_table,
                                           _select_most_k_similar_mapper,
                                           kw={
                                               'top_k_similar_indices':
                                               top_k_similar_indices,
                                               'k': self.k
                                           },
                                           shape_hint=(N, self.k),
                                           cost_hint={
                                               hash(top_k_similar_indices): {
                                                   '00': 0,
                                                   '01': cost,
                                                   '10': cost,
                                                   '11': cost
                                               }
                                           })
        self.top_k_similar_table = top_k_similar_table.optimized().glom()
        self.top_k_similar_indices = top_k_similar_indices.optimized().glom()
Example #17
  def fit(self, X, centers=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    X = expr.force(X)
    num_dim = X.shape[1]
    labels = expr.zeros((X.shape[0],1), dtype=np.int, tile_hint=X.tile_shape())
  
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)
    
    for i in range(self.n_iter):
      # Reset them to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim), reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int, reduce_fn=lambda a, b: a + b)
      
      _ = expr.shuffle(X,
                        _find_cluster_mapper,
                        kw={'d_pts' : X,
                            'old_centers' : centers,
                            'new_centers' : new_centers,
                            'new_counts' : new_counts,
                            'labels': labels
                            })
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()
      
      # If any centroids don't have any points assigned to them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to get rid of dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers

    return centers, labels
Example #18
  def fit(self, X, centers=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int)

    if centers is None:
      centers = expr.from_numpy(np.random.rand(self.n_clusters, num_dim))

    for i in range(self.n_iter):
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)
      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)

      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to get rid of dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels
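The broadcast formulation above is a plain NumPy pattern once the arrays are local: argmin over pairwise squared distances, one-hot matches, then two reductions. A compact local sketch of one such iteration:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(50, 2)
centers = rng.rand(4, 2)

dist = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
labels = np.argmin(dist, axis=1)
matches = (labels[:, None] == np.arange(4)[None, :]).astype(np.int64)
counts = matches.sum(axis=0)
centers = (X[:, None, :] * matches[:, :, None]).sum(axis=0)

counts[counts == 0] = 1                    # avoid dividing by zero
centers = centers / counts.reshape(4, 1)   # mean of the assigned points
print(centers.shape)  # (4, 2)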
Example #19
def center_data(X, y, fit_intercept, normalize=False):
  """
  Centers data to have mean zero along axis 0. This is here because
  nearly all linear models will want their data to be centered.
  """
  if fit_intercept:
    X_mean = X.mean(axis=0)
    X_mean = expr.reshape(X_mean, (1, X_mean.shape[0]))
    X -= X_mean
    
    if normalize:
      X_std = expr.sqrt(expr.sum(X ** 2, axis=0)).force()
      X_std[X_std == 0] = 1
      X /= X_std
    else:
      X_std = expr.ones(X.shape[1])
    
    y_mean = y.mean(axis=0)
    y -= y_mean
  else:
    X_mean = expr.zeros(X.shape[1])
    X_std = expr.ones(X.shape[1])
    y_mean = 0. if y.ndim == 1 else expr.zeros(y.shape[1], dtype=X.dtype)
  return X, y, X_mean, y_mean, X_std
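The same centering in plain NumPy, including the clamp of zero column norms to one so the division is safe:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(6, 3)
y = rng.rand(6)

X_mean = X.mean(axis=0)
X = X - X_mean                          # mean zero along axis 0
X_std = np.sqrt((X ** 2).sum(axis=0))   # column L2 norms
X_std[X_std == 0] = 1                   # avoid dividing by zero
X = X / X_std
y = y - y.mean(axis=0)
print(np.allclose(X.mean(axis=0), 0.0))  # True: columns are centered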
Example #20
def fit(data, labels, num_tiles, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.
 
  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    num_tiles(int): the total tiles of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = None
  m = data.shape[0] / num_tiles
  alpha = expr.zeros((m * num_tiles, 1), dtype=np.float64, tile_hint=(m,1)).force()
  for i in range(T):
    new_weight = expr.ndarray((data.shape[1], 1), dtype=np.float64, reduce_fn=np.add, tile_hint=(data.shape[1], 1))
    new_weight = expr.shuffle(data, _svm_mapper, target=new_weight, kw={'labels': labels, 'alpha': alpha, 'w': w, 'm': m, 'scale': num_tiles, 'lambda_n': la * data.shape[0]})
    w = new_weight / num_tiles
  return w
Example #21
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.
  
  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans. 
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
      centers = expr.rand(k, num_dim, tile_hint=(k, num_dim))
  
  labels = expr.zeros((points.shape[0],), dtype=np.int, tile_hint=(points.shape[0]/len(points.tiles),))
  for iter in range(num_iter):
    new_centers = expr.ndarray((k, num_dim), reduce_fn=lambda a, b: a + b, tile_hint=(k, num_dim))
    new_counts = expr.ndarray((k, 1), dtype=np.float, reduce_fn=lambda a, b: a + b, tile_hint=(k, 1))
    expr.shuffle(points, _fuzzy_kmeans_mapper, kw={'old_centers': centers, 
                                                   'centers': new_centers, 
                                                   'counts': new_counts, 
                                                   'labels': labels, 
                                                   'm': m}).force()
    
    # If any centroids don't have any points assigned to them.
    zcount_indices = (new_counts.glom() == 0).reshape(k)
      
    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which results in their
      # position being the zero-vector.  We reseed these centroids with new random values
      # and set their counts to 1 in order to get rid of dividing by zero.
      new_counts[zcount_indices, :] = 1
      new_centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices), num_dim)
        
    centers = new_centers / new_counts
    
  return labels
Example #22
def simulate(ts_all, te_all, lamb_all, num_paths):
  '''Range over a number of independent products.

  :param ts_all: DistArray
    Start dates for a series of swaptions.
  :param te_all: DistArray
    End dates for a series of swaptions.
  :param lamb_all: DistArray
    Parameter values for a series of swaptions.
  :param num_paths: Int
    Number of paths used in random walk.

  :rtype: DistArray

  '''
  swaptions = []
  i = 0
  for ts_a, te, lamb in zip(ts_all, te_all, lamb_all):
    for ts in ts_a:
      #start = time()
      print i
      time_structure = arange(None, 0, ts + DELTA, DELTA)
      maturity_structure = arange(None, 0, te, DELTA)

      ############# MODEL ###############
      # Variance reduction technique - Antithetic Variates.
      eps_tmp = randn(time_structure.shape[0] - 1, num_paths)
      eps = concatenate(eps_tmp, -eps_tmp, 1)

      # Forward LIBOR rates for the construction of the spot measure.
      f_kk = zeros((time_structure.shape[0], 2*num_paths))
      f_kk = assign(f_kk, np.s_[0, :], F_0)

      # Plane kxN of simulated LIBOR rates.
      f_kn = ones((maturity_structure.shape[0], 2*num_paths))*F_0

      # Simulations of the plane f_kn for each time step.
      for t in xrange(1, time_structure.shape[0]):
        f_kn_new = f_kn[1:, :]*exp(lamb*mu(f_kn, lamb)*DELTA-0.5*lamb*lamb *
            DELTA + lamb*eps[t - 1, :]*sqrt(DELTA))
        f_kk = assign(f_kk, np.s_[t, :], f_kn_new[0])
        f_kn = f_kn_new

      ############## PRODUCT ###############
      # Value of zero coupon bonds.
      zcb = ones((int((te-ts)/DELTA)+1, 2*num_paths))
      f_kn_modified = 1 + DELTA*f_kn
      for j in xrange(zcb.shape[0] - 1):
        zcb = assign(zcb, np.s_[j + 1], zcb[j] / f_kn_modified[j])

      # Swaption price at maturity.
      last_row = zcb[zcb.shape[0] - 1, :].reshape((20, ))
      swap_ts = maximum(1 - last_row - THETA*DELTA*expr.sum(zcb[1:], 0), 0)

      # Spot measure used for discounting.
      b_ts = ones((2*num_paths, ))
      tmp = 1 + DELTA * f_kk
      for j in xrange(int(ts/DELTA)):
        b_ts *= tmp[j].reshape((20, ))

      # Swaption price at time 0.
      swaption = swap_ts/b_ts

      # Save expected value in bps and std.
      me = mean((swaption[0:num_paths] + swaption[num_paths:])/2) * 10000
      st = std((swaption[0:num_paths] + swaption[num_paths:])/2)/sqrt(num_paths)*10000

      swaptions.append([me.optimized().force(), st.optimized().force()])
      #print time() - start
      i += 1
  return swaptions
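The "Antithetic Variates" step pairs every Gaussian draw with its negation, which keeps the sample mean of the shocks at exactly zero and reduces the variance of the paired estimate averaged at the end. The construction in isolation:

import numpy as np

rng = np.random.RandomState(0)
num_paths = 4
eps_tmp = rng.randn(3, num_paths)
eps = np.concatenate((eps_tmp, -eps_tmp), axis=1)  # shape (3, 2 * num_paths)
print(eps.shape)          # (3, 8)
print(eps.mean(axis=1))   # zeros: each row's shocks cancel exactly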
Example #23
  def test_count_zero(self):
    x = expr.ones((TEST_SIZE, ))
    Assert.eq(expr.count_zero(x).glom(), 0)
    x = expr.zeros((TEST_SIZE, ))
    Assert.eq(expr.count_zero(x).glom(), TEST_SIZE)
Example #24
    def train_smo_1998(self, data, labels):
        '''
    Train an SVM model using the SMO (1998) algorithm.
   
    Args:
      data(Expr): points to be trained
      labels(Expr): the correct labels of the training data
    '''

        N = data.shape[0]  # Number of instances
        D = data.shape[1]  # Number of features

        self.b = 0.0
        self.alpha = expr.zeros((N, 1),
                                dtype=np.float64,
                                tile_hint=[N / self.ctx.num_workers,
                                           1]).force()

        # linear kernel
        kernel_results = expr.dot(data,
                                  expr.transpose(data),
                                  tile_hint=[N / self.ctx.num_workers, N])

        labels = expr.force(labels)
        self.E = expr.zeros((N, 1),
                            dtype=np.float64,
                            tile_hint=[N / self.ctx.num_workers, 1]).force()
        for i in xrange(N):
            self.E[i, 0] = self.b + expr.reduce(
                self.alpha,
                axis=None,
                dtype_fn=lambda input: input.dtype,
                local_reduce_fn=margin_mapper,
                accumulate_fn=np.add,
                fn_kw=dict(
                    label=labels,
                    data=kernel_results[:, i].force())).glom() - labels[i, 0]

        util.log_info("Starting SMO")
        it = 0
        num_changed = 0
        examine_all = True
        while (num_changed > 0 or examine_all) and (it < self.maxiter):
            util.log_info("Iteration:%d", it)

            num_changed = 0

            if examine_all:
                for i in xrange(N):
                    num_changed += self.examine_example(
                        i, N, labels, kernel_results)
            else:
                for i in xrange(N):
                    if self.alpha[i, 0] > 0 and self.alpha[i, 0] < self.C:
                        num_changed += self.examine_example(
                            i, N, labels, kernel_results)

            it += 1

            if examine_all: examine_all = False
            elif num_changed == 0: examine_all = True

        self.w = expr.zeros((D, 1), dtype=np.float64).force()
        for i in xrange(D):
            self.w[i, 0] = expr.reduce(self.alpha,
                                       axis=None,
                                       dtype_fn=lambda input: input.dtype,
                                       local_reduce_fn=margin_mapper,
                                       accumulate_fn=np.add,
                                       fn_kw=dict(label=labels,
                                                  data=expr.force(
                                                      data[:, i]))).glom()
        self.usew_ = True
        print 'iterations finished:', it
        print 'b:', self.b
        print 'w:', self.w.glom()
Example #25
  def train_smo_2005(self, data, labels):
    '''
    Train an SVM model using the SMO (2005) algorithm.
   
    Args:
      data(Expr): points to be trained
      labels(Expr): the correct labels of the training data
    '''
    
    N = data.shape[0] # Number of instances
    D = data.shape[1]  # Number of features

    self.b = 0.0
    alpha = expr.zeros((N,1), dtype=np.float64, tile_hint=[N/self.ctx.num_workers, 1]).force()
    
    # linear kernel
    kernel_results = expr.dot(data, expr.transpose(data), tile_hint=[N/self.ctx.num_workers, N])
    gradient = expr.ones((N, 1), dtype=np.float64, tile_hint=[N/self.ctx.num_workers, 1]) * -1.0
    
    expr_labels = expr.lazify(labels)
    
    util.log_info("Starting SMO")
    pv1 = pv2 = -1
    it = 0
    while it < self.maxiter:
      util.log_info("Iteration:%d", it)
      
      minObj = 1e100
      
      expr_alpha = expr.lazify(alpha)
      G = expr.multiply(labels, gradient) * -1.0

      v1_mask = ((expr_labels > self.tol) * (expr_alpha < self.C) + (expr_labels < -self.tol) * (expr_alpha > self.tol))
      v1 = expr.argmax(G[v1_mask-True]).glom().item()
      maxG = G[v1,0].glom()
      print 'maxv1:', v1, 'maxG:', maxG

      v2_mask = ((expr_labels > self.tol) * (expr_alpha > self.tol) + (expr_labels < -self.tol) * (expr_alpha < self.C))     
      min_v2 = expr.argmin(G[v2_mask-True]).glom().item()
      minG = G[min_v2,0].glom()
      #print 'minv2:', min_v2, 'minG:', minG
      
      set_v2 = v2_mask.glom().nonzero()[0]
      #print 'actives:', set_v2.shape[0]
      v2 = -1
      for v in set_v2:
        b = maxG - G[v,0].glom()
        if b > self.tol:
          na = (kernel_results[v1,v1] + kernel_results[v,v] - 2*kernel_results[v1,v]).glom()[0][0]
          if na < self.tol: na = 1e12
          
          obj = -(b*b)/na
          if obj <= minObj and v1 != pv1 or v != pv2:
            v2 = v
            a = na
            minObj = obj
      
      if v2 == -1: break
      if maxG - minG < self.tol: break
      
      print 'opt v1:', v1, 'v2:', v2

      pv1 = v1
      pv2 = v2
    
      y1 = labels[v1,0]
      y2 = labels[v2,0]    
        
      oldA1 = alpha[v1,0]
      oldA2 = alpha[v2,0]
      
      # Calculate new alpha values, to reduce the objective function...
      b = y2*expr.glom(gradient[v2,0]) - y1*expr.glom(gradient[v1,0])
      if y1 != y2:
        a += 4 * kernel_results[v1,v2].glom()
      
      newA1 = oldA1 + y1*b/a
      newA2 = oldA2 - y2*b/a   

      # Correct for alpha being out of range...
      sum = y1*oldA1 + y2*oldA2
  
      if newA1 < self.tol: newA1 = 0.0
      elif newA1 > self.C: newA1 = self.C
     
      newA2 = y2 * (sum - y1 * newA1) 

      if newA2 < self.tol: newA2 = 0.0
      elif newA2 > self.C: newA2 = self.C
     
      newA1 = y1 * (sum - y2 * newA2)
  
      # Update the gradient...
      dA1 = newA1 - oldA1
      dA2 = newA2 - oldA2
  
      gradient += expr.multiply(labels, kernel_results[:,v1]) * y1 * dA1 + expr.multiply(labels, kernel_results[:,v2]) * y2 * dA2

      alpha[v1,0] = newA1
      alpha[v2,0] = newA2
 
      #print 'alpha:', alpha.glom().T
      
      it += 1
      #print 'gradient:', gradient.glom().T

    self.w = expr.zeros((D, 1), dtype=np.float64).force()
    for i in xrange(D): 
      self.w[i,0] = expr.reduce(alpha, axis=None, dtype_fn=lambda input: input.dtype,
                                local_reduce_fn=margin_mapper,
                                accumulate_fn=np.add, 
                                fn_kw=dict(label=labels, data=expr.force(data[:,i]))).glom()
    
    self.b = 0.0
    E = (labels - self.margins(data)).force()
    
    minB = -1e100
    maxB = 1e100
    actualB = 0.0
    numActualB = 0
    
    for i in xrange(N):
      ai = alpha[i,0]
      yi = labels[i,0]
      Ei = E[i,0]
      
      if ai < 1e-3:
        if yi < self.tol:
          maxB = min((maxB,Ei))
        else:
          minB = max((minB,Ei))
      elif ai > self.C - 1e-3:
        if yi < self.tol:
          minB = max((minB,Ei))
        else:
          maxB = min((maxB,Ei))
      else:
        numActualB += 1
        actualB += (Ei - actualB) / float(numActualB)
    if numActualB > 0:
      self.b = actualB
    else:
      self.b = 0.5*(minB + maxB)

    self.usew_ = True
    print 'iterations finished:', it
    print 'b:', self.b
    print 'w:', self.w.glom()
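The closing loop estimates the bias b as the running mean of the errors E_i over the unbounded support vectors (0 < alpha_i < C), falling back to the midpoint of the KKT bounds when none exist. A vectorized NumPy equivalent of that estimate, with toy stand-ins:

import numpy as np

rng = np.random.RandomState(0)
N, C = 8, 1.0
alpha = rng.rand(N)   # toy dual variables
E = rng.randn(N)      # toy errors E_i = f(x_i) - y_i

free = (alpha > 1e-3) & (alpha < C - 1e-3)   # unbounded support vectors
if free.any():
  b = E[free].mean()   # the running mean above converges to exactly this
else:
  b = 0.0              # the snippet uses 0.5 * (minB + maxB) here instead
print(b)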
Example #26
    def train_smo_2005(self, data, labels):
        '''
    Train an SVM model using the SMO (2005) algorithm.
   
    Args:
      data(Expr): points to be trained
      labels(Expr): the correct labels of the training data
    '''

        N = data.shape[0]  # Number of instances
        D = data.shape[1]  # Number of features

        self.b = 0.0
        alpha = expr.zeros((N, 1),
                           dtype=np.float64,
                           tile_hint=[N / self.ctx.num_workers, 1]).force()

        # linear kernel
        kernel_results = expr.dot(data,
                                  expr.transpose(data),
                                  tile_hint=[N / self.ctx.num_workers, N])
        gradient = expr.ones(
            (N, 1), dtype=np.float64, tile_hint=[N / self.ctx.num_workers, 1
                                                 ]) * -1.0

        expr_labels = expr.lazify(labels)

        util.log_info("Starting SMO")
        pv1 = pv2 = -1
        it = 0
        while it < self.maxiter:
            util.log_info("Iteration:%d", it)

            minObj = 1e100

            expr_alpha = expr.lazify(alpha)
            G = expr.multiply(labels, gradient) * -1.0

            v1_mask = ((expr_labels > self.tol) * (expr_alpha < self.C) +
                       (expr_labels < -self.tol) * (expr_alpha > self.tol))
            v1 = expr.argmax(G[v1_mask - True]).glom().item()
            maxG = G[v1, 0].glom()
            print 'maxv1:', v1, 'maxG:', maxG

            v2_mask = ((expr_labels > self.tol) * (expr_alpha > self.tol) +
                       (expr_labels < -self.tol) * (expr_alpha < self.C))
            min_v2 = expr.argmin(G[v2_mask - True]).glom().item()
            minG = G[min_v2, 0].glom()
            #print 'minv2:', min_v2, 'minG:', minG

            set_v2 = v2_mask.glom().nonzero()[0]
            #print 'actives:', set_v2.shape[0]
            v2 = -1
            for v in set_v2:
                b = maxG - G[v, 0].glom()
                if b > self.tol:
                    na = (kernel_results[v1, v1] + kernel_results[v, v] -
                          2 * kernel_results[v1, v]).glom()[0][0]
                    if na < self.tol: na = 1e12

                    obj = -(b * b) / na
                    if obj <= minObj and v1 != pv1 or v != pv2:
                        v2 = v
                        a = na
                        minObj = obj

            if v2 == -1: break
            if maxG - minG < self.tol: break

            print 'opt v1:', v1, 'v2:', v2

            pv1 = v1
            pv2 = v2

            y1 = labels[v1, 0]
            y2 = labels[v2, 0]

            oldA1 = alpha[v1, 0]
            oldA2 = alpha[v2, 0]

            # Calculate new alpha values, to reduce the objective function...
            b = y2 * expr.glom(gradient[v2, 0]) - y1 * expr.glom(gradient[v1,
                                                                          0])
            if y1 != y2:
                a += 4 * kernel_results[v1, v2].glom()

            newA1 = oldA1 + y1 * b / a
            newA2 = oldA2 - y2 * b / a

            # Correct for alpha being out of range...
            sum = y1 * oldA1 + y2 * oldA2

            if newA1 < self.tol: newA1 = 0.0
            elif newA1 > self.C: newA1 = self.C

            newA2 = y2 * (sum - y1 * newA1)

            if newA2 < self.tol: newA2 = 0.0
            elif newA2 > self.C: newA2 = self.C

            newA1 = y1 * (sum - y2 * newA2)

            # Update the gradient...
            dA1 = newA1 - oldA1
            dA2 = newA2 - oldA2

            gradient += expr.multiply(
                labels, kernel_results[:, v1]) * y1 * dA1 + expr.multiply(
                    labels, kernel_results[:, v2]) * y2 * dA2

            alpha[v1, 0] = newA1
            alpha[v2, 0] = newA2

            #print 'alpha:', alpha.glom().T

            it += 1
            #print 'gradient:', gradient.glom().T

        self.w = expr.zeros((D, 1), dtype=np.float64).force()
        for i in xrange(D):
            self.w[i, 0] = expr.reduce(alpha,
                                       axis=None,
                                       dtype_fn=lambda input: input.dtype,
                                       local_reduce_fn=margin_mapper,
                                       accumulate_fn=np.add,
                                       fn_kw=dict(label=labels,
                                                  data=expr.force(
                                                      data[:, i]))).glom()

        self.b = 0.0
        E = (labels - self.margins(data)).force()

        minB = -1e100
        maxB = 1e100
        actualB = 0.0
        numActualB = 0

        for i in xrange(N):
            ai = alpha[i, 0]
            yi = labels[i, 0]
            Ei = E[i, 0]

            if ai < 1e-3:
                if yi < self.tol:
                    maxB = min((maxB, Ei))
                else:
                    minB = max((minB, Ei))
            elif ai > self.C - 1e-3:
                if yi < self.tol:
                    minB = max((minB, Ei))
                else:
                    maxB = min((maxB, Ei))
            else:
                numActualB += 1
                actualB += (Ei - actualB) / float(numActualB)
        if numActualB > 0:
            self.b = actualB
        else:
            self.b = 0.5 * (minB + maxB)

        self.usew_ = True
        print 'iterations finished:', it
        print 'b:', self.b
        print 'w:', self.w.glom()
Example #27
  def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int)

    if implementation == 'map2':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                           shape=(X.shape[0], ))

        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
      return centers, labels

    elif implementation == 'outer':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                            shape=(X.shape[0],))
        #labels = expr.argmin(distances, axis=1)
        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'broadcast':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        util.log_warn("k_means_ %d %d", i, time.time())
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0],
                                                   centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
        labels = expr.argmin(distances, axis=1)
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                                matches.shape[1], 1)),
                           axis=0)

        counts = counts.optimized().glom()
        centers = centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'shuffle':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        # Reset them to zero.
        new_centers = expr.ndarray((self.n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                  reduce_fn=lambda a, b: a + b)

        _ = expr.shuffle(X,
                         _find_cluster_mapper,
                         kw={'d_pts': X,
                             'old_centers': centers,
                             'new_centers': new_centers,
                             'new_counts': new_counts,
                             'labels': labels},
                         shape_hint=(1,),
                         cost_hint={hash(labels): {'00': 0,
                                                   '01': np.prod(labels.shape)}})
        _.force()

        new_counts = new_counts.glom()
        new_centers = new_centers.glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (new_counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          new_counts[zcount_indices] = 1
          new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        new_centers = new_centers / new_counts
        centers = new_centers

      return centers, labels
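All four implementations share the same empty-cluster repair: clamp zero counts to one and reseed those centroids randomly. Factored out as a small NumPy helper:

import numpy as np

def reseed_empty_clusters(centers, counts, num_dim, rng=np.random):
  '''Reseed centroids that received no points (mirrors the blocks above).'''
  zcount_indices = (counts == 0).reshape(-1)
  if np.any(zcount_indices):
    counts = counts.copy()
    centers = centers.copy()
    counts[zcount_indices] = 1  # avoid dividing by zero later
    centers[zcount_indices, :] = rng.randn(np.count_nonzero(zcount_indices),
                                           num_dim)
  return centers, counts

centers, counts = reseed_empty_clusters(np.zeros((3, 2)),
                                        np.array([5, 0, 2]), 2)
print(counts)  # [5 1 2]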
Example #28
  def test_count_nonzero(self):
    x = expr.ones((TEST_SIZE,))
    Assert.eq(expr.count_nonzero(x).glom(), TEST_SIZE)
    x = expr.zeros((TEST_SIZE,))
    Assert.eq(expr.count_nonzero(x).glom(), 0)
Example #29
def simulate(ts_all, te_all, lamb_all, num_paths):
    """Range over a number of independent products.

  :param ts_all: DistArray
    Start dates for a series of swaptions.
  :param te_all: DistArray
    End dates for a series of swaptions.
  :param lamb_all: DistArray
    Parameter values for a series of swaptions.
  :param num_paths: Int
    Number of paths used in random walk.

  :rtype: DistArray

  """
    swaptions = []
    i = 0
    for ts_a, te, lamb in zip(ts_all, te_all, lamb_all):
        for ts in ts_a:
            # start = time()
            print i
            time_structure = arange(None, 0, ts + DELTA, DELTA)
            maturity_structure = arange(None, 0, te, DELTA)

            ############# MODEL ###############
            # Variance reduction technique - Antithetic Variates.
            eps_tmp = randn(time_structure.shape[0] - 1, num_paths)
            eps = concatenate(eps_tmp, -eps_tmp, 1)

            # Forward LIBOR rates for the construction of the spot measure.
            f_kk = zeros((time_structure.shape[0], 2 * num_paths))
            f_kk = assign(f_kk, np.s_[0, :], F_0)

            # Plane kxN of simulated LIBOR rates.
            f_kn = ones((maturity_structure.shape[0], 2 * num_paths)) * F_0

            # Simulations of the plane f_kn for each time step.
            for t in xrange(1, time_structure.shape[0]):
                f_kn_new = f_kn[1:, :] * exp(
                    lamb * mu(f_kn, lamb) * DELTA - 0.5 * lamb * lamb * DELTA + lamb * eps[t - 1, :] * sqrt(DELTA)
                )
                f_kk = assign(f_kk, np.s_[t, :], f_kn_new[0])
                f_kn = f_kn_new

            ############## PRODUCT ###############
            # Value of zero coupon bonds.
            zcb = ones((int((te - ts) / DELTA) + 1, 2 * num_paths))
            f_kn_modified = 1 + DELTA * f_kn
            for j in xrange(zcb.shape[0] - 1):
                zcb = assign(zcb, np.s_[j + 1], zcb[j] / f_kn_modified[j])

            # Swaption price at maturity.
            last_row = zcb[zcb.shape[0] - 1, :].reshape((20,))
            swap_ts = maximum(1 - last_row - THETA * DELTA * expr.sum(zcb[1:], 0), 0)

            # Spot measure used for discounting.
            b_ts = ones((2 * num_paths,))
            tmp = 1 + DELTA * f_kk
            for j in xrange(int(ts / DELTA)):
                b_ts *= tmp[j].reshape((20,))

            # Swaption price at time 0.
            swaption = swap_ts / b_ts

            # Save expected value in bps and std.
            me = mean((swaption[0:num_paths] + swaption[num_paths:]) / 2) * 10000
            st = std((swaption[0:num_paths] + swaption[num_paths:]) / 2) / sqrt(num_paths) * 10000

            swaptions.append([me.optimized().force(), st.optimized().force()])
            # print time() - start
            i += 1
    return swaptions
Example #30
    def fit(self, X, centers=None, implementation='map2'):
        """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
        num_dim = X.shape[1]
        num_points = X.shape[0]

        labels = expr.zeros((num_points, 1), dtype=np.int)

        if implementation == 'map2':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.map2(X,
                                   0,
                                   fn=kmeans_map2_dist_mapper,
                                   fn_kw={"centers": centers},
                                   shape=(X.shape[0], ))

                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
            return centers, labels

        elif implementation == 'outer':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.outer((X, centers), (0, None),
                                    fn=kmeans_outer_dist_mapper,
                                    shape=(X.shape[0], ))
                #labels = expr.argmin(distances, axis=1)
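                # expr.outer presumably fuses the distance computation and the
                # per-row argmin in a single pass over X, so the full (N, k)
                # distance matrix is never materialized; per tile this is roughly:
                #   labels_tile = ((Xl[:, None, :] - centers[None, :, :]) ** 2).sum(2).argmin(1)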
                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # Find centroids that have no points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Set the count to 1 to avoid division by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'broadcast':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                util.log_warn("k_means_ %d %d", i, time.time())
                X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
                centers_broadcast = expr.reshape(
                    centers, (1, centers.shape[0], centers.shape[1]))
                distances = expr.sum(expr.square(X_broadcast -
                                                 centers_broadcast),
                                     axis=2)
                labels = expr.argmin(distances, axis=1)
                center_idx = expr.arange((1, centers.shape[0]))
                matches = expr.reshape(labels,
                                       (labels.shape[0], 1)) == center_idx
                matches = matches.astype(np.int64)
                counts = expr.sum(matches, axis=0)
                centers = expr.sum(
                    X_broadcast *
                    expr.reshape(matches,
                                 (matches.shape[0], matches.shape[1], 1)),
                    axis=0)
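                # The comparison with center_idx builds a one-hot (N, k) membership
                # matrix; the same trick in plain NumPy (a sketch, with k clusters):
                #   matches = (labels[:, None] == np.arange(k)).astype(np.int64)
                #   counts  = matches.sum(axis=0)                     # points per cluster
                #   centers = (X[:, None, :] * matches[:, :, None]).sum(axis=0)  # coord sums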

                counts = counts.optimized().glom()
                centers = centers.optimized().glom()

                # Find centroids that have no points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Set the count to 1 to avoid division by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'shuffle':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                # Reset them to zero.
                new_centers = expr.ndarray((self.n_clusters, num_dim),
                                           reduce_fn=lambda a, b: a + b)
                new_counts = expr.ndarray((self.n_clusters, 1),
                                          dtype=np.int,
                                          reduce_fn=lambda a, b: a + b)

                _ = expr.shuffle(X,
                                 _find_cluster_mapper,
                                 kw={
                                     'd_pts': X,
                                     'old_centers': centers,
                                     'new_centers': new_centers,
                                     'new_counts': new_counts,
                                     'labels': labels
                                 },
                                 shape_hint=(1, ),
                                 cost_hint={
                                     hash(labels): {
                                         '00': 0,
                                         '01': np.prod(labels.shape)
                                     }
                                 })
                _.force()
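                # shuffle applies _find_cluster_mapper to each tile of X; the mapper
                # presumably assigns each local point to its nearest old center and
                # scatters per-cluster coordinate sums and counts into new_centers
                # and new_counts, whose reduce_fn (a + b) merges updates coming
                # from different workers.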

                new_counts = new_counts.glom()
                new_centers = new_centers.glom()

                # Find centroids that have no points assigned to them.
                zcount_indices = (new_counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Set the count to 1 to avoid division by zero.
                    new_counts[zcount_indices] = 1
                    new_centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                new_centers = new_centers / new_counts
                centers = new_centers

            return centers, labels
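
A hedged usage sketch of fit; the class name KMeans and its constructor arguments are assumptions inferred from the self.n_clusters and self.n_iter attributes referenced above:

import numpy as np
from spartan import expr

model = KMeans(n_clusters=4, n_iter=10)        # hypothetical constructor
X = expr.from_numpy(np.random.rand(1000, 8))   # tiled by rows
centers, labels = model.fit(X, implementation='map2')
print centers.shape                            # (4, 8)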
Example No. 31
0
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
    '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the fuzziness exponent of fuzzy k-means.
    centers(Expr or DistArray): the initial centers of each cluster.
  '''
    points = expr.force(points)
    num_dim = points.shape[1]
    if centers is None:
        centers = expr.rand(k, num_dim)

    labels = expr.zeros((points.shape[0], ), dtype=np.int)

    for it in range(num_iter):
        centers = expr.as_array(centers)
        points_broadcast = expr.reshape(points,
                                        (points.shape[0], 1, points.shape[1]))
        centers_broadcast = expr.reshape(
            centers, (1, centers.shape[0], centers.shape[1]))
        distances = expr.sum(expr.square(points_broadcast - centers_broadcast),
                             axis=2)
        # Add a tiny epsilon to avoid division by zero.
        distances = distances + 1e-11
        util.log_info('distances shape %s' % str(distances.shape))
        distances_broadcast = expr.reshape(
            distances, (distances.shape[0], 1, distances.shape[1]))
        distances_broadcast2 = expr.reshape(
            distances, (distances.shape[0], distances.shape[1], 1))
        prob = 1.0 / expr.sum(expr.power(
            distances_broadcast2 / distances_broadcast, 2.0 / (m - 1)),
                              axis=2)
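        # Standard fuzzy c-means membership, written elementwise:
        #   prob[i, j] = 1 / sum_l (d[i, j] / d[i, l]) ** (2 / (m - 1))
        # where d is the squared distance computed above; the ratio is oriented
        # so that closer centers receive higher membership.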
        prob.force()
        counts = expr.sum(prob, axis=0)
        counts = expr.reshape(counts, (counts.shape[0], 1))
        labels = expr.argmax(prob, axis=1)
        centers = expr.sum(
            expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
            expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
            axis=0)

        # We assume the centers matrix is small enough to be handled
        # on the master.
        counts = counts.glom()
        centers = centers.glom()
        # Find centroids that have no points assigned to them.
        zcount_indices = (counts == 0).reshape(k)

        if np.any(zcount_indices):
            # One or more centroids may not have any points assigned to them, which
            # results in their position being the zero-vector.  We reseed these
            # centroids with new random values and set their counts to 1 to avoid
            # division by zero.
            counts[zcount_indices, :] = 1
            centers[zcount_indices, :] = np.random.rand(
                np.count_nonzero(zcount_indices), num_dim)

        centers = centers / counts
    return labels
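
For reference, a minimal single-node NumPy version of one fuzzy k-means iteration, mirroring the membership and center-update steps above (the function name is hypothetical):

import numpy as np

def fuzzy_kmeans_step(points, centers, m=2.0):
  # Squared distances from every point to every center: shape (N, k).
  d = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2) + 1e-11
  # Membership: prob[i, j] = 1 / sum_l (d[i, j] / d[i, l]) ** (2 / (m - 1)).
  prob = 1.0 / ((d[:, :, None] / d[:, None, :]) ** (2.0 / (m - 1))).sum(axis=2)
  labels = prob.argmax(axis=1)
  counts = prob.sum(axis=0)          # soft per-cluster counts, shape (k,)
  counts[counts == 0] = 1            # guard against division by zero
  new_centers = (points[:, None, :] * prob[:, :, None]).sum(axis=0) / counts[:, None]
  return new_centers, labels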