import time

import numpy as np

from spartan import expr, util


def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1,
                 max_iter=10, max_iter_per_doc=1):
  """
  Train an LDA topic model with the collapsed variational Bayes method
  (following the Mahout implementation).

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics(int): the number of topics to find.
    alpha(float): document-topic smoothing parameter of the LDA model.
    eta(float): topic-term smoothing parameter of the LDA model.
    max_iter(int): the maximum number of iterations to train the LDA model.
    max_iter_per_doc(int): the maximum number of inference iterations per document.
  """
  num_terms = terms_docs_matrix.shape[0]
  num_docs = terms_docs_matrix.shape[1]

  topic_term_counts = expr.rand(k_topics, num_terms)
  for i in range(max_iter):
    # Accumulate new topic-term counts from every document.
    #topic_term_counts = expr.shuffle(expr.retile(terms_docs_matrix,
    #                                             tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
    #                                 _lda_mapper,
    #                                 target=expr.ndarray((k_topics, num_terms),
    #                                                     dtype=np.float64, reduce_fn=np.add),
    #                                 kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
    #                                     'max_iter_per_doc': max_iter_per_doc,
    #                                     'topic_term_counts': topic_term_counts}).optimized()
    topic_term_counts = expr.outer(
        (terms_docs_matrix, topic_term_counts), (1, None),
        fn=_lda_mapper,
        fn_kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
               'max_iter_per_doc': max_iter_per_doc},
        shape=(k_topics, num_terms),
        dtype=np.float64,
        reducer=np.add)

  # Calculate the doc-topic inference.
  #doc_topics = expr.shuffle(expr.retile(terms_docs_matrix,
  #                                      tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
  #                          _lda_doc_topic_mapper,
  #                          kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
  #                              'max_iter_per_doc': max_iter_per_doc,
  #                              'topic_term_counts': topic_term_counts},
  #                          shape_hint=(num_docs, k_topics)).optimized()
  doc_topics = expr.outer(
      (terms_docs_matrix, topic_term_counts), (1, None),
      fn=_lda_doc_topic_mapper,
      fn_kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
             'max_iter_per_doc': max_iter_per_doc},
      shape=(num_docs, k_topics),
      dtype=np.float64)

  # Normalize the topic-term counts into a topic-term distribution.
  norm_val = expr.reduce(
      topic_term_counts,
      axis=1,
      dtype_fn=lambda input: input.dtype,
      local_reduce_fn=lambda ex, data, axis: np.abs(data).sum(axis),
      accumulate_fn=np.add)
  topic_term_counts = topic_term_counts / norm_val.reshape((k_topics, 1))
  topic_term_counts = topic_term_counts.optimized()

  return doc_topics, topic_term_counts
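# Usage sketch (illustrative, not part of the original module). It assumes the
# Spartan runtime is already initialized and that the `_lda_mapper` and
# `_lda_doc_topic_mapper` helpers referenced above are defined elsewhere in
# this module. The random count matrix stands in for a real terms-by-documents
# corpus.
def _demo_learn_topics():
  rng = np.random.RandomState(0)
  # 1000 terms x 200 documents of synthetic term counts.
  counts = expr.from_numpy(rng.randint(0, 5, size=(1000, 200)).astype(np.float64))
  doc_topics, topic_term_counts = learn_topics(counts, k_topics=10, max_iter=5)
  # glom() gathers a distributed result into a local numpy array.
  print(doc_topics.glom().shape)         # (200, 10)
  print(topic_term_counts.glom().shape)  # (10, 1000)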
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20,
        num_iter=10, M=None):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS)
  method, where `A` is the "ratings" matrix mapping from a user and an item to
  a rating score, and `U` and `M` are the factor matrices representing user
  and item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the regularization parameter (lambda) of ALS.
    alpha(int): confidence parameter used for implicit feedback.
    implicit_feedback(bool): whether to use the implicit-feedback variant of ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): maximum number of iterations to run.
    M(Expr or DistArray, optional): the initial item-factor matrix. If None, it
      is seeded with the average rating of each item in its first column and
      random values elsewhere.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]

  AT = expr.transpose(A)

  if M is None:
    # Seed M with each item's average rating in the first column and random
    # values elsewhere.
    avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)
    M = expr.rand(num_items, num_features)
    M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))

  #A = expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0))
  #AT = expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0))
  for i in range(num_iter):
    # Recompute U with M fixed: one regularized least-squares solve per user.
    shape = (num_users, num_features)
    U = expr.outer(
        (A, M), (0, None),
        fn=_solve_U_or_M_mapper,
        fn_kw={'la': la, 'alpha': alpha,
               'implicit_feedback': implicit_feedback, 'shape': shape},
        shape=shape,
        dtype=np.float64)

    # Recompute M with U fixed, using the transposed ratings matrix.
    shape = (num_items, num_features)
    M = expr.outer(
        (AT, U), (0, None),
        fn=_solve_U_or_M_mapper,
        fn_kw={'la': la, 'alpha': alpha,
               'implicit_feedback': implicit_feedback, 'shape': shape},
        shape=shape,
        dtype=np.float64)
  return U, M
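# Usage sketch (illustrative, not part of the original module). It assumes the
# Spartan runtime is already initialized and that `_solve_U_or_M_mapper` is
# defined elsewhere in this module. The random matrix stands in for a real
# user-by-item ratings matrix.
def _demo_als():
  rng = np.random.RandomState(0)
  ratings = expr.from_numpy(rng.randint(0, 6, size=(500, 100)).astype(np.float64))
  U, M = als(ratings, num_features=20, num_iter=5)
  # U dot M' approximates the observed ratings.
  approx = U.glom().dot(M.glom().T)
  print(approx.shape)  # (500, 100)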
def fit(self, X, centers=None, implementation='outer'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray or Expr. The initial centers. If None, they will be
      randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int64)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                         fn_kw={'centers': centers},
                         shape=(X.shape[0], ))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them, their
      # position becomes the zero vector. We reseed those centroids with
      # new random values.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)

    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None),
                          fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0], ))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them, their
      # position becomes the zero vector. We reseed those centroids with
      # new random values.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)

    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn('k_means_ %d %d', i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers,
                                       (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)

      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast *
                         expr.reshape(matches, (matches.shape[0],
                                                matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them, their
      # position becomes the zero vector. We reseed those centroids with
      # new random values.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)

    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset the accumulators to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim),
                                 reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int64,
                                reduce_fn=lambda a, b: a + b)
      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1, ),
                       cost_hint={hash(labels): {'00': 0,
                                                 '01': np.prod(labels.shape)}})
      _.force()
      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # If any centroids don't have any points assigned to them, their
      # position becomes the zero vector. We reseed those centroids with
      # new random values.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        n_points = np.count_nonzero(zcount_indices)
        # Avoid dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers

    return centers, labels
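# Usage sketch (illustrative, not part of the original module). It assumes
# `fit` is a method of a KMeans-style estimator exposing `n_clusters` and
# `n_iter` attributes (the constructor below is hypothetical), that the
# Spartan runtime is already initialized, and that the kmeans_* mapper
# helpers referenced above are defined elsewhere in this module.
def _demo_kmeans_fit():
  rng = np.random.RandomState(0)
  points = expr.from_numpy(rng.rand(1000, 4))  # 1000 samples, 4 features
  model = KMeans(n_clusters=8, n_iter=10)      # hypothetical constructor
  centers, labels = model.fit(points, implementation='outer')
  print(centers.glom().shape)  # (8, 4)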