def streaming_kmeans(points, k=10, num_iters=10, num_ballkmeans_runs=2, trim_factor=0.9,
                     test_probability=0.1, correct_weight=False):
  '''
  Cluster data points with the streaming k-means method.

  The work happens in three steps:
    1. ``_streaming_mapper`` is run over the tiles of ``points`` and the result is evaluated.
    2. The per-tile centroid lists are flattened into one list and handed to ``ball_kmeans``.
    3. The centers of the first ``k`` centroids returned by ``ball_kmeans`` are packed into a
       ``(k, points.shape[1])`` array and ``_cluster_mapper`` is shuffled over ``points`` with them.

  Args:
    points(DistArray): data points to be clustered.
    k(int): the final number of clusters.
    num_iters(int): the number of iterations to run in each ball kmeans run.
    num_ballkmeans_runs(int): the number of ball kmeans to run.
    trim_factor(float): the ball kmeans parameter to separate the nearest points and distant points.
    test_probability(float): the percentage of points to be chosen as test set.
    correct_weight(bool): whether to correct the weights of the centroids.

  Returns:
    The result of ``expr.shuffle`` over ``points`` with ``_cluster_mapper``, created with a
    shape hint of ``(points.shape[0],)``.
  '''
  tile_sketches = expr.tile_operation(points, _streaming_mapper, kw={'k': k}).evaluate()

  # Each tile result is an iterable of centroid lists; collapse everything
  # into one flat list before the ball k-means refinement.
  candidates = [centroid
                for per_tile in tile_sketches.values()
                for group in per_tile
                for centroid in group]

  refined = ball_kmeans(candidates, k, num_iters, num_ballkmeans_runs, trim_factor,
                        test_probability, correct_weight)

  # Copy the first k refined centroids into a dense array, one row per center.
  centers = np.zeros((k, points.shape[1]))
  for idx in range(k):
    centers[idx] = refined[idx].get_center()

  labels = expr.shuffle(points, _cluster_mapper,
                        kw={'centers': centers}, shape_hint=(points.shape[0],))
  return labels
def canopy_cluster(points, t1=0.1, t2=0.1, cf=1):
  '''
  A simple implementation of the canopy clustering method.

  ``_canopy_mapper`` is run over the tiles of ``points`` and evaluated, the per-tile
  results are reduced to a set of centers by ``find_centers``, and finally
  ``_cluster_mapper`` is shuffled over ``points`` with those centers.

  Args:
    points(Expr or DistArray): the input data points matrix.
    t1(float): distance threshold between a center point and the points within a canopy.
    t2(float): second distance threshold, passed alongside ``t1`` to both ``_canopy_mapper``
      and ``find_centers``. NOTE(review): presumably the tighter of the two canopy
      thresholds -- confirm against ``_canopy_mapper``.
    cf(int): the minimum canopy size.

  Returns:
    The result of ``expr.shuffle`` over ``points`` with ``_cluster_mapper``, created with a
    shape hint of ``(points.shape[0],)``.
  '''
  mapper_args = {'t1': t1, 't2': t2, 'cf': cf}
  tile_canopies = expr.tile_operation(points, _canopy_mapper, kw=mapper_args).evaluate()

  # Merge the per-tile results into the final set of centers.
  centers = find_centers(tile_canopies.values(), t1, t2, cf)

  return expr.shuffle(points,
                      _cluster_mapper,
                      kw={'centers': centers},
                      shape_hint=(points.shape[0],))
Example #3
0
def streaming_kmeans(points,
                     k=10,
                     num_iters=10,
                     num_ballkmeans_runs=2,
                     trim_factor=0.9,
                     test_probability=0.1,
                     correct_weight=False):
    '''
    Cluster data points with the streaming k-means method.

    ``_streaming_mapper`` is applied per tile and evaluated; the centroid lists it
    yields are flattened and refined by ``ball_kmeans``; the centers of the first
    ``k`` refined centroids are then used by ``_cluster_mapper`` to process
    ``points``.

    Args:
      points(DistArray): data points to be clustered.
      k(int): the final number of clusters.
      num_iters(int): the number of iterations to run in each ball kmeans run.
      num_ballkmeans_runs(int): the number of ball kmeans to run.
      trim_factor(float): the ball kmeans parameter to separate the nearest points and distant points.
      test_probability(float): the percentage of points to be chosen as test set.
      correct_weight(bool): whether to correct the weights of the centroids.

    Returns:
      The result of ``expr.shuffle`` over ``points`` with ``_cluster_mapper``,
      created with a shape hint of ``(points.shape[0],)``.
    '''
    mapper_kw = {'k': k}
    per_tile_results = expr.tile_operation(points, _streaming_mapper,
                                           kw=mapper_kw).evaluate()

    # Flatten tile -> centroid lists -> centroids into a single list.
    flat_centroids = [
        centroid
        for tile_result in per_tile_results.values()
        for centroid_group in tile_result
        for centroid in centroid_group
    ]

    final_centroids = ball_kmeans(flat_centroids, k, num_iters,
                                  num_ballkmeans_runs, trim_factor,
                                  test_probability, correct_weight)

    # One row per cluster center, filled from the first k refined centroids.
    centers = np.zeros((k, points.shape[1]))
    for row in range(k):
        centers[row] = final_centroids[row].get_center()

    assignment = expr.shuffle(points,
                              _cluster_mapper,
                              kw={'centers': centers},
                              shape_hint=(points.shape[0], ))
    return assignment
def canopy_cluster(points, t1=0.1, t2=0.1, cf=1):
    '''
    A simple implementation of the canopy clustering method.

    ``_canopy_mapper`` is run over the tiles of ``points`` and forced, the per-tile
    results are reduced to a set of centers by ``find_centers``, and
    ``_cluster_mapper`` is then shuffled over ``points`` with those centers.

    Args:
      points(Expr or DistArray): the input data points matrix.
      t1(float): distance threshold between a center point and the points within a canopy.
      t2(float): second distance threshold, passed alongside ``t1`` to both
        ``_canopy_mapper`` and ``find_centers``. NOTE(review): presumably the
        tighter of the two canopy thresholds -- confirm against ``_canopy_mapper``.
      cf(int): the minimum canopy size.

    Returns:
      The result of ``expr.shuffle`` over ``points`` with ``_cluster_mapper``,
      created with a shape hint of ``(points.shape[0],)``.
    '''
    canopy_kw = {'t1': t1, 't2': t2, 'cf': cf}
    tile_canopies = expr.tile_operation(points, _canopy_mapper,
                                        kw=canopy_kw).force()

    # Merge the per-tile results into the final set of centers.
    centers = find_centers(tile_canopies.values(), t1, t2, cf)

    return expr.shuffle(points,
                        _cluster_mapper,
                        kw={'centers': centers},
                        shape_hint=(points.shape[0], ))