Exemple #1
0
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy kmeans clustering method (map2 variant).

  Every iteration builds a (num_points, num_centers) membership matrix with
  ``kmeans_map2_dist_mapper``, takes its row-wise argmax as the hard label of
  each point, and accumulates new centers with ``kmeans_map2_center_mapper``
  (partial results combined with ``np.add``), normalized by the column sums of
  ``fuzzy ** m``.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters; only used when ``centers`` is None.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.

  Returns:
    The labels computed in the last iteration (argmax of the membership matrix
    along axis 1), or an all-zero label vector when ``num_iter`` is not positive.
  '''
  points = points.evaluate()
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  # Defined up front so that num_iter <= 0 returns a well-formed result instead
  # of raising UnboundLocalError at the return statement.
  labels = expr.zeros((points.shape[0],), dtype=int)

  for _ in range(num_iter):
    # Both mappers receive the current centers through fn_kw.
    centers = centers.glom()
    mapper_kw = {"centers": centers, "m": m}

    fuzzy = expr.map2(points, 0, fn=kmeans_map2_dist_mapper,
                      fn_kw=mapper_kw,
                      shape=(points.shape[0], centers.shape[0]))
    labels = expr.argmax(fuzzy, axis=1)

    new_centers = expr.map2((points, fuzzy), (0, 0), fn=kmeans_map2_center_mapper,
                            fn_kw=mapper_kw,
                            shape=(centers.shape[0], centers.shape[1]), reducer=np.add)
    # Normalize each accumulated center by its total (fuzzified) membership.
    new_centers /= expr.sum(fuzzy ** m, axis=0)[:, expr.newaxis]
    centers = new_centers
  return labels
Exemple #2
0
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy kmeans clustering method.

  Every iteration computes the squared distance from each point to each center,
  turns the distances into a (num_points, num_centers) membership matrix,
  labels each point with its highest-membership center, and recomputes every
  center as the membership-weighted sum of the points divided by the total
  membership of that center.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters; only used when ``centers`` is None.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.

  Returns:
    The labels computed in the last iteration (argmax of the membership matrix
    along axis 1), or an all-zero label vector when ``num_iter`` is not positive.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  # Builtin int is what the removed ``np.int`` alias referred to.
  labels = expr.zeros((points.shape[0],), dtype=int)

  for _ in range(num_iter):
    centers = expr.as_array(centers)
    points_broadcast = expr.reshape(points, (points.shape[0], 1, points.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(points_broadcast - centers_broadcast), axis=2)
    # This is used to avoid dividing zero
    distances = distances + 0.00000000001
    util.log_info('distances shape %s' % str(distances.shape))

    # own_dist[i, a, 0] == distances[i, a]; other_dist[i, 0, b] == distances[i, b]
    own_dist = expr.reshape(distances, (distances.shape[0],
                                        distances.shape[1], 1))
    other_dist = expr.reshape(distances, (distances.shape[0], 1,
                                          distances.shape[1]))
    # Membership of point i in cluster a is 1 / sum_b (d[i,a] / d[i,b]) ** p, so
    # it shrinks as the point moves away from center a.  The ratio must be
    # own / other; the reverse makes membership grow with distance.
    # NOTE(review): distances are squared here while the exponent is 2/(m-1);
    # confirm whether 1/(m-1) was intended for squared distances.
    prob = 1.0 / expr.sum(expr.power(own_dist / other_dist,
                                     2.0 / (m - 1)), axis=2)
    prob.force()
    counts = expr.sum(prob, axis=0)
    counts = expr.reshape(counts, (counts.shape[0], 1))
    labels = expr.argmax(prob, axis=1)
    centers = expr.sum(points_broadcast *
                       expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
                       axis=0)

    # We assume that the size of centers are relative small that can be handled
    # on the master.
    counts = counts.glom()
    centers = centers.glom()
    # If any centroids don't have any points assigned to them.
    # Flatten with -1 rather than k so that a caller-supplied ``centers`` whose
    # row count differs from ``k`` still works.
    zcount_indices = (counts == 0).reshape(-1)

    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which results in their
      # position being the zero-vector.  We reseed these centroids with new random values
      # and set their counts to 1 in order to get rid of dividing by zero.
      counts[zcount_indices, :] = 1
      centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                  num_dim)

    centers = centers / counts
  return labels
Exemple #3
0
  def train_smo_2005(self, data, labels):
    '''
    Train an SVM model using the SMO (2005) algorithm.
   
    Args:
      data(Expr): points to be trained
      labels(Expr): the correct labels of the training data
    '''
    
    N = data.shape[0] # Number of instances
    D = data.shape[1]  # Number of features

    self.b = 0.0
    alpha = expr.zeros((N,1), dtype=np.float64, tile_hint=[N/self.ctx.num_workers, 1]).force()
    
    # linear kernel
    kernel_results = expr.dot(data, expr.transpose(data), tile_hint=[N/self.ctx.num_workers, N])
    gradient = expr.ones((N, 1), dtype=np.float64, tile_hint=[N/self.ctx.num_workers, 1]) * -1.0
    
    expr_labels = expr.lazify(labels)
    
    util.log_info("Starting SMO")
    pv1 = pv2 = -1
    it = 0
    while it < self.maxiter:
      util.log_info("Iteration:%d", it)
      
      minObj = 1e100
      
      expr_alpha = expr.lazify(alpha)
      G = expr.multiply(labels, gradient) * -1.0

      v1_mask = ((expr_labels > self.tol) * (expr_alpha < self.C) + (expr_labels < -self.tol) * (expr_alpha > self.tol))
      v1 = expr.argmax(G[v1_mask-True]).glom().item()
      maxG = G[v1,0].glom()
      print 'maxv1:', v1, 'maxG:', maxG

      v2_mask = ((expr_labels > self.tol) * (expr_alpha > self.tol) + (expr_labels < -self.tol) * (expr_alpha < self.C))     
      min_v2 = expr.argmin(G[v2_mask-True]).glom().item()
      minG = G[min_v2,0].glom()
      #print 'minv2:', min_v2, 'minG:', minG
      
      set_v2 = v2_mask.glom().nonzero()[0]
      #print 'actives:', set_v2.shape[0]
      v2 = -1
      for v in set_v2:
        b = maxG - G[v,0].glom()
        if b > self.tol:
          na = (kernel_results[v1,v1] + kernel_results[v,v] - 2*kernel_results[v1,v]).glom()[0][0]
          if na < self.tol: na = 1e12
          
          obj = -(b*b)/na
          if obj <= minObj and v1 != pv1 or v != pv2:
            v2 = v
            a = na
            minObj = obj
      
      if v2 == -1: break
      if maxG - minG < self.tol: break
      
      print 'opt v1:', v1, 'v2:', v2

      pv1 = v1
      pv2 = v2
    
      y1 = labels[v1,0]
      y2 = labels[v2,0]    
        
      oldA1 = alpha[v1,0]
      oldA2 = alpha[v2,0]
      
      # Calculate new alpha values, to reduce the objective function...
      b = y2*expr.glom(gradient[v2,0]) - y1*expr.glom(gradient[v1,0])
      if y1 != y2:
        a += 4 * kernel_results[v1,v2].glom()
      
      newA1 = oldA1 + y1*b/a
      newA2 = oldA2 - y2*b/a   

      # Correct for alpha being out of range...
      sum = y1*oldA1 + y2*oldA2;
  
      if newA1 < self.tol: newA1 = 0.0
      elif newA1 > self.C: newA1 = self.C
     
      newA2 = y2 * (sum - y1 * newA1) 

      if newA2 < self.tol: newA2 = 0.0;
      elif newA2 > self.C: newA2 = self.C
     
      newA1 = y1 * (sum - y2 * newA2)
  
      # Update the gradient...
      dA1 = newA1 - oldA1
      dA2 = newA2 - oldA2
  
      gradient += expr.multiply(labels, kernel_results[:,v1]) * y1 * dA1 + expr.multiply(labels, kernel_results[:,v2]) * y2 * dA2

      alpha[v1,0] = newA1
      alpha[v2,0] = newA2
 
      #print 'alpha:', alpha.glom().T
      
      it += 1
      #print 'gradient:', gradient.glom().T

    self.w = expr.zeros((D, 1), dtype=np.float64).force()
    for i in xrange(D): 
      self.w[i,0] = expr.reduce(alpha, axis=None, dtype_fn=lambda input: input.dtype,
                                local_reduce_fn=margin_mapper,
                                accumulate_fn=np.add, 
                                fn_kw=dict(label=labels, data=expr.force(data[:,i]))).glom()
    
    self.b = 0.0
    E = (labels - self.margins(data)).force()
    
    minB = -1e100
    maxB = 1e100
    actualB = 0.0
    numActualB = 0
    
    for i in xrange(N):
      ai = alpha[i,0]
      yi = labels[i,0]
      Ei = E[i,0]
      
      if ai < 1e-3:
        if yi < self.tol:
          maxB = min((maxB,Ei))
        else:
          minB = max((minB,Ei))
      elif ai > self.C - 1e-3:
        if yi < self.tol:
          minB = max((minB,Ei))
        else:
          maxB = min((maxB,Ei))
      else:
        numActualB += 1
        actualB += (Ei - actualB) / float(numActualB)
    if numActualB > 0:
      self.b = actualB
    else:
      self.b = 0.5*(minB + maxB)

    self.usew_ = True
    print 'iteration finish:', it
    print 'b:', self.b
    print 'w:', self.w.glom()
Exemple #4
0
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
    '''
    Cluster data points using the fuzzy kmeans clustering method.

    Every iteration computes the squared distance from each point to each
    center, turns the distances into a (num_points, num_centers) membership
    matrix, labels each point with its highest-membership center, and
    recomputes every center as the membership-weighted sum of the points
    divided by the total membership of that center.

    Args:
      points(Expr or DistArray): the input data points matrix.
      k(int): the number of clusters; only used when ``centers`` is None.
      num_iter(int): the max iterations to run.
      m(float): the parameter of fuzzy kmeans.
      centers(Expr or DistArray): the initialized centers of each cluster.

    Returns:
      The labels computed in the last iteration (argmax of the membership
      matrix along axis 1), or an all-zero label vector when ``num_iter`` is
      not positive.
    '''
    points = expr.force(points)
    num_dim = points.shape[1]
    if centers is None:
        centers = expr.rand(k, num_dim)

    # Builtin int is what the removed ``np.int`` alias referred to.
    labels = expr.zeros((points.shape[0], ), dtype=int)

    for _ in range(num_iter):
        centers = expr.as_array(centers)
        points_broadcast = expr.reshape(points,
                                        (points.shape[0], 1, points.shape[1]))
        centers_broadcast = expr.reshape(
            centers, (1, centers.shape[0], centers.shape[1]))
        distances = expr.sum(expr.square(points_broadcast - centers_broadcast),
                             axis=2)
        # This is used to avoid dividing zero
        distances = distances + 0.00000000001
        util.log_info('distances shape %s' % str(distances.shape))

        # own_dist[i, a, 0] == distances[i, a];
        # other_dist[i, 0, b] == distances[i, b]
        own_dist = expr.reshape(
            distances, (distances.shape[0], distances.shape[1], 1))
        other_dist = expr.reshape(
            distances, (distances.shape[0], 1, distances.shape[1]))
        # Membership of point i in cluster a is
        # 1 / sum_b (d[i,a] / d[i,b]) ** p, so it shrinks as the point moves
        # away from center a.  The ratio must be own / other; the reverse
        # makes membership grow with distance.
        # NOTE(review): distances are squared here while the exponent is
        # 2/(m-1); confirm whether 1/(m-1) was intended for squared distances.
        prob = 1.0 / expr.sum(expr.power(own_dist / other_dist,
                                         2.0 / (m - 1)),
                              axis=2)
        prob.force()
        counts = expr.sum(prob, axis=0)
        counts = expr.reshape(counts, (counts.shape[0], 1))
        labels = expr.argmax(prob, axis=1)
        centers = expr.sum(
            points_broadcast *
            expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
            axis=0)

        # We assume that the size of centers are relative small that can be handled
        # on the master.
        counts = counts.glom()
        centers = centers.glom()
        # If any centroids don't have any points assigned to them.
        # Flatten with -1 rather than k so that a caller-supplied ``centers``
        # whose row count differs from ``k`` still works.
        zcount_indices = (counts == 0).reshape(-1)

        if np.any(zcount_indices):
            # One or more centroids may not have any points assigned to them, which results in their
            # position being the zero-vector.  We reseed these centroids with new random values
            # and set their counts to 1 in order to get rid of dividing by zero.
            counts[zcount_indices, :] = 1
            centers[zcount_indices, :] = np.random.rand(
                np.count_nonzero(zcount_indices), num_dim)

        centers = centers / counts
    return labels
Exemple #5
0
    def train_smo_2005(self, data, labels):
        '''
    Train an SVM model using the SMO (2005) algorithm.
   
    Args:
      data(Expr): points to be trained
      labels(Expr): the correct labels of the training data
    '''

        N = data.shape[0]  # Number of instances
        D = data.shape[1]  # Number of features

        self.b = 0.0
        alpha = expr.zeros((N, 1),
                           dtype=np.float64,
                           tile_hint=[N / self.ctx.num_workers, 1]).force()

        # linear kernel
        kernel_results = expr.dot(data,
                                  expr.transpose(data),
                                  tile_hint=[N / self.ctx.num_workers, N])
        gradient = expr.ones(
            (N, 1), dtype=np.float64, tile_hint=[N / self.ctx.num_workers, 1
                                                 ]) * -1.0

        expr_labels = expr.lazify(labels)

        util.log_info("Starting SMO")
        pv1 = pv2 = -1
        it = 0
        while it < self.maxiter:
            util.log_info("Iteration:%d", it)

            minObj = 1e100

            expr_alpha = expr.lazify(alpha)
            G = expr.multiply(labels, gradient) * -1.0

            v1_mask = ((expr_labels > self.tol) * (expr_alpha < self.C) +
                       (expr_labels < -self.tol) * (expr_alpha > self.tol))
            v1 = expr.argmax(G[v1_mask - True]).glom().item()
            maxG = G[v1, 0].glom()
            print 'maxv1:', v1, 'maxG:', maxG

            v2_mask = ((expr_labels > self.tol) * (expr_alpha > self.tol) +
                       (expr_labels < -self.tol) * (expr_alpha < self.C))
            min_v2 = expr.argmin(G[v2_mask - True]).glom().item()
            minG = G[min_v2, 0].glom()
            #print 'minv2:', min_v2, 'minG:', minG

            set_v2 = v2_mask.glom().nonzero()[0]
            #print 'actives:', set_v2.shape[0]
            v2 = -1
            for v in set_v2:
                b = maxG - G[v, 0].glom()
                if b > self.tol:
                    na = (kernel_results[v1, v1] + kernel_results[v, v] -
                          2 * kernel_results[v1, v]).glom()[0][0]
                    if na < self.tol: na = 1e12

                    obj = -(b * b) / na
                    if obj <= minObj and v1 != pv1 or v != pv2:
                        v2 = v
                        a = na
                        minObj = obj

            if v2 == -1: break
            if maxG - minG < self.tol: break

            print 'opt v1:', v1, 'v2:', v2

            pv1 = v1
            pv2 = v2

            y1 = labels[v1, 0]
            y2 = labels[v2, 0]

            oldA1 = alpha[v1, 0]
            oldA2 = alpha[v2, 0]

            # Calculate new alpha values, to reduce the objective function...
            b = y2 * expr.glom(gradient[v2, 0]) - y1 * expr.glom(gradient[v1,
                                                                          0])
            if y1 != y2:
                a += 4 * kernel_results[v1, v2].glom()

            newA1 = oldA1 + y1 * b / a
            newA2 = oldA2 - y2 * b / a

            # Correct for alpha being out of range...
            sum = y1 * oldA1 + y2 * oldA2

            if newA1 < self.tol: newA1 = 0.0
            elif newA1 > self.C: newA1 = self.C

            newA2 = y2 * (sum - y1 * newA1)

            if newA2 < self.tol: newA2 = 0.0
            elif newA2 > self.C: newA2 = self.C

            newA1 = y1 * (sum - y2 * newA2)

            # Update the gradient...
            dA1 = newA1 - oldA1
            dA2 = newA2 - oldA2

            gradient += expr.multiply(
                labels, kernel_results[:, v1]) * y1 * dA1 + expr.multiply(
                    labels, kernel_results[:, v2]) * y2 * dA2

            alpha[v1, 0] = newA1
            alpha[v2, 0] = newA2

            #print 'alpha:', alpha.glom().T

            it += 1
            #print 'gradient:', gradient.glom().T

        self.w = expr.zeros((D, 1), dtype=np.float64).force()
        for i in xrange(D):
            self.w[i, 0] = expr.reduce(alpha,
                                       axis=None,
                                       dtype_fn=lambda input: input.dtype,
                                       local_reduce_fn=margin_mapper,
                                       accumulate_fn=np.add,
                                       fn_kw=dict(label=labels,
                                                  data=expr.force(
                                                      data[:, i]))).glom()

        self.b = 0.0
        E = (labels - self.margins(data)).force()

        minB = -1e100
        maxB = 1e100
        actualB = 0.0
        numActualB = 0

        for i in xrange(N):
            ai = alpha[i, 0]
            yi = labels[i, 0]
            Ei = E[i, 0]

            if ai < 1e-3:
                if yi < self.tol:
                    maxB = min((maxB, Ei))
                else:
                    minB = max((minB, Ei))
            elif ai > self.C - 1e-3:
                if yi < self.tol:
                    minB = max((minB, Ei))
                else:
                    maxB = min((maxB, Ei))
            else:
                numActualB += 1
                actualB += (Ei - actualB) / float(numActualB)
        if numActualB > 0:
            self.b = actualB
        else:
            self.b = 0.5 * (minB + maxB)

        self.usew_ = True
        print 'iteration finish:', it
        print 'b:', self.b
        print 'w:', self.w.glom()