Example #1
def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1, max_iter=10, max_iter_per_doc=1):
    """
    Train an LDA topic model using the Collapsed Variational Bayes method
    (following the Mahout implementation).

    Args:
      terms_docs_matrix (Expr or DistArray): the count of each term in each document.
      k_topics (int): the number of topics to find.
      alpha (float): document-topic smoothing parameter of the LDA model.
      eta (float): topic-term smoothing parameter of the LDA model.
      max_iter (int): the maximum number of iterations to train the model.
      max_iter_per_doc (int): the maximum number of iterations to train each document.
    """
    num_terms = terms_docs_matrix.shape[0]
    num_docs = terms_docs_matrix.shape[1]

    topic_term_counts = expr.rand(k_topics, num_terms)
    for i in range(max_iter):
        # Legacy shuffle-based implementation, superseded by expr.outer below:
        # topic_term_counts = expr.shuffle(
        #     expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
        #     _lda_mapper,
        #     target=expr.ndarray((k_topics, num_terms), dtype=np.float64, reduce_fn=np.add),
        #     kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta, 'max_iter_per_doc': max_iter_per_doc,
        #         'topic_term_counts': topic_term_counts}).optimized()
        topic_term_counts = expr.outer(
            (terms_docs_matrix, topic_term_counts),
            (1, None),
            fn=_lda_mapper,
            fn_kw={"k_topics": k_topics, "alpha": alpha, "eta": eta, "max_iter_per_doc": max_iter_per_doc},
            shape=(k_topics, num_terms),
            dtype=np.float64,
            reducer=np.add,
        )
    # Calculate the per-document topic distributions.
    # Legacy shuffle-based implementation, superseded by expr.outer below:
    # doc_topics = expr.shuffle(
    #     expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
    #     _lda_doc_topic_mapper,
    #     kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta, 'max_iter_per_doc': max_iter_per_doc,
    #         'topic_term_counts': topic_term_counts},
    #     shape_hint=(num_docs, k_topics)).optimized()
    doc_topics = expr.outer(
        (terms_docs_matrix, topic_term_counts),
        (1, None),
        fn=_lda_doc_topic_mapper,
        fn_kw={"k_topics": k_topics, "alpha": alpha, "eta": eta, "max_iter_per_doc": max_iter_per_doc},
        shape=(num_docs, k_topics),
        dtype=np.float64,
    )

    # Normalize the topic-term distribution (L1-normalize each topic row).
    norm_val = expr.reduce(
        topic_term_counts,
        axis=1,
        dtype_fn=lambda input: input.dtype,
        local_reduce_fn=lambda ex, data, axis: np.abs(data).sum(axis),
        accumulate_fn=np.add,
    )
    topic_term_counts = topic_term_counts / norm_val.reshape((k_topics, 1))
    topic_term_counts = topic_term_counts.optimized()
    return doc_topics, topic_term_counts
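
A minimal call sketch (hypothetical: it assumes an already-initialized Spartan runtime, that `expr` is `spartan.expr`, and uses a dense random matrix in place of real per-document term counts):

from spartan import expr

# Stand-in data: a (num_terms x num_docs) matrix of fake counts.
term_doc_counts = expr.rand(1000, 200)
doc_topics, topic_term = learn_topics(term_doc_counts, k_topics=10, max_iter=5)

# glom() materializes the distributed results as local numpy arrays.
print(doc_topics.glom().shape)  # (200, 10): one topic mixture per document
print(topic_term.glom().shape)  # (10, 1000): one term distribution per topic
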
Example #2
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10, M=None):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS) method,
  where `A` is the "ratings" matrix mapping each (user, item) pair to a rating score,
  and `U` and `M` are the factor matrices representing user and item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps each (user, item) pair to a rating score.
    la(float): the regularization parameter (lambda) of ALS.
    alpha(int): confidence parameter used for implicit feedback.
    implicit_feedback(bool): whether to use the implicit-feedback variant of ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): maximum number of iterations to run.
    M(Expr or DistArray): optional initial item-factor matrix; randomly initialized if None.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]

  AT = expr.transpose(A)

  avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

  if M is None:
    M = expr.rand(num_items, num_features)
    M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))

  # A = expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0))
  # AT = expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0))
  for i in range(num_iter):
    # Recomputing U
    shape = (num_users, num_features)
    U = expr.outer((A, M), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float64)
    # Recomputing M
    shape = (num_items, num_features)
    M = expr.outer((AT, U), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float64)
  return U, M
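
A hypothetical call sketch for als (assumes a Spartan runtime; the dense random matrix stands in for a real, typically sparse, ratings matrix):

from spartan import expr

ratings = expr.rand(500, 300)  # 500 users x 300 items, fake scores
U, M = als(ratings, num_features=20, num_iter=5)

# A is approximated by U M'; glom() pulls the factors into local numpy arrays.
print(U.glom().shape)  # (500, 20)
print(M.glom().shape)  # (300, 20)
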
Example #3
  def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, they are randomly generated.
    implementation : str. One of 'map2', 'outer', 'broadcast' or 'shuffle';
        selects the distributed strategy used for assignment and center updates.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int64)

    if implementation == 'map2':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                           shape=(X.shape[0], ))

        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Avoid division by zero when computing the new centers.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
      return centers, labels

    elif implementation == 'outer':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                            shape=(X.shape[0],))
        # labels = expr.argmin(distances, axis=1)
        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Avoid division by zero when computing the new centers.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'broadcast':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        util.log_warn("k_means_ %d %d", i, time.time())
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0],
                                                   centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
        labels = expr.argmin(distances, axis=1)
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                                matches.shape[1], 1)),
                           axis=0)

        counts = counts.optimized().glom()
        centers = centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Avoid division by zero when computing the new centers.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'shuffle':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        # Reset them to zero.
        new_centers = expr.ndarray((self.n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int64,
                                  reduce_fn=lambda a, b: a + b)

        _ = expr.shuffle(X,
                         _find_cluster_mapper,
                         kw={'d_pts': X,
                             'old_centers': centers,
                             'new_centers': new_centers,
                             'new_counts': new_counts,
                             'labels': labels},
                         shape_hint=(1,),
                         cost_hint={hash(labels): {'00': 0,
                                                   '01': np.prod(labels.shape)}})
        _.force()

        new_counts = new_counts.glom()
        new_centers = new_centers.glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (new_counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Avoid division by zero when computing the new centers.
          new_counts[zcount_indices] = 1
          new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        new_centers = new_centers / new_counts
        centers = new_centers

      return centers, labels
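
A usage sketch (hypothetical: it assumes the enclosing KMeans class takes n_clusters and n_iter in its constructor and stores them on self, plus an initialized Spartan runtime):

from spartan import expr

model = KMeans(n_clusters=8, n_iter=10)  # constructor signature is an assumption
points = expr.rand(10000, 16)            # stand-in data, tiled by rows

centers, labels = model.fit(points, implementation='outer')
print(centers.glom().shape)  # (8, 16)
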