def fit(data, labels, label_size, alpha=1.0):
  '''
  Train standard naive bayes model.

  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of naive bayes model.
  '''
  # calc document freq
  df = expr.reduce(data,
                   axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add)

  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1

  # Normalized Frequency for a feature in a document is calculated by dividing
  # the feature frequency by the root mean square of features frequencies in
  # that document
  square_sum = expr.reduce(data,
                           axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add)

  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])

  # calculate weight normalized Tf-Idf
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))

  # add up all the feature vectors with the same labels
  #weights_per_label_and_feature = expr.ndarray((label_size, data.shape[1]), dtype=np.float64)
  #for i in range(label_size):
  #  i_mask = (labels == i)
  #  weights_per_label_and_feature = expr.assign(weights_per_label_and_feature, np.s_[i, :], expr.sum(data[i_mask, :], axis=0))
  weights_per_label_and_feature = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                                               _sum_instance_by_label_mapper,
                                               target=expr.ndarray((label_size, data.shape[1]), dtype=np.float64, reduce_fn=np.add),
                                               kw={'labels': labels, 'label_size': label_size},
                                               cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})

  # sum up all the weights for each label from the previous step
  weights_per_label = expr.sum(weights_per_label_and_feature, axis=1)

  # generate naive bayes per_label_and_feature weights
  weights_per_label_and_feature = expr.log((weights_per_label_and_feature + alpha) /
                                           (weights_per_label.reshape((weights_per_label.shape[0], 1)) +
                                            alpha * weights_per_label_and_feature.shape[1]))

  return {'scores_per_label_and_feature': weights_per_label_and_feature.optimized().force(),
          'scores_per_label': weights_per_label.optimized().force(),
          }

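# The distributed pipeline above is easier to follow on a small dense matrix.
# Below is a minimal NumPy sketch of the same computation, assuming dense input
# with no all-zero documents; fit_dense and its local variables are illustrative
# only and are not part of the Spartan API.
import numpy as np

def fit_dense(data, labels, label_size, alpha=1.0):
  # Document frequency and smoothed inverse document frequency per feature.
  df = (data > 0).sum(axis=0)
  idf = np.log(data.shape[0] * 1.0 / (df + 1)) + 1

  # Normalize each document by the root mean square of its feature frequencies,
  # then weight by idf (the "weight normalized Tf-Idf" step above).
  rms = np.sqrt(np.square(data).sum(axis=1) * 1.0 / data.shape[1])
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))

  # Add up the weighted feature vectors of all documents sharing a label
  # (what _sum_instance_by_label_mapper does in parallel).
  weights_per_label_and_feature = np.zeros((label_size, data.shape[1]))
  for i in range(label_size):
    weights_per_label_and_feature[i, :] = data[labels == i, :].sum(axis=0)
  weights_per_label = weights_per_label_and_feature.sum(axis=1)

  # Smoothed multinomial naive bayes log-weights.
  scores = np.log((weights_per_label_and_feature + alpha) /
                  (weights_per_label.reshape((label_size, 1)) +
                   alpha * weights_per_label_and_feature.shape[1]))
  return {'scores_per_label_and_feature': scores,
          'scores_per_label': weights_per_label}
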
def test_pca(self):
  FLAGS.opt_parakeet_gen = 0
  data = np.random.randn(*DIM)
  A = expr.from_numpy(data, tile_hint=util.calc_tile_hint(DIM, axis=0))

  m = PCA(N_COMPONENTS)
  m2 = SK_PCA(N_COMPONENTS)

  m.fit(A)
  m2.fit(data)
  print m2.components_ - m.components_

  # Principal components are only determined up to sign, so compare magnitudes.
  assert np.allclose(absolute(m.components_), absolute(m2.components_))

def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1, max_iter=10, max_iter_per_doc=1):
  '''
  Train an LDA topic model using the Collapsed Variational Bayes method (Mahout implementation).

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics: the number of topics we need to find.
    alpha(float): parameter of LDA model.
    eta(float): parameter of LDA model.
    max_iter(int): the max iterations to train LDA topic model.
    max_iter_per_doc: the max iterations to train each document.
  '''
  num_terms = terms_docs_matrix.shape[0]
  num_docs = terms_docs_matrix.shape[1]

  topic_term_counts = expr.rand(k_topics, num_terms)

  for i in range(max_iter):
    # update the topic-term counts from all documents
    topic_term_counts = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
                                     _lda_mapper,
                                     target=expr.ndarray((k_topics, num_terms), dtype=np.float64, reduce_fn=np.add),
                                     kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
                                         'max_iter_per_doc': max_iter_per_doc,
                                         'topic_term_counts': topic_term_counts}).optimized()

  # calculate the doc-topic inference
  doc_topics = expr.shuffle(expr.retile(terms_docs_matrix, tile_hint=util.calc_tile_hint(terms_docs_matrix, axis=1)),
                            _lda_doc_topic_mapper,
                            kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
                                'max_iter_per_doc': max_iter_per_doc,
                                'topic_term_counts': topic_term_counts},
                            shape_hint=(num_docs, k_topics)).optimized()

  # normalize the topic-term distribution
  norm_val = expr.reduce(topic_term_counts,
                         axis=1,
                         dtype_fn=lambda input: input.dtype,
                         local_reduce_fn=lambda ex, data, axis: np.abs(data).sum(axis),
                         accumulate_fn=np.add)
  topic_term_counts = topic_term_counts / norm_val.reshape((k_topics, 1))
  topic_term_counts = topic_term_counts.optimized()

  return doc_topics, topic_term_counts

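# A small usage sketch for learn_topics, assuming a local NumPy count matrix is
# pushed into the cluster with expr.from_numpy (as in the PCA test above) and
# results are pulled back with glom(); the toy corpus below is made up.
import numpy as np
from spartan import expr

# Hypothetical 6-term x 4-document matrix of raw term counts.
counts = np.array([[2, 0, 1, 0],
                   [1, 3, 0, 0],
                   [0, 1, 0, 2],
                   [0, 0, 4, 1],
                   [3, 0, 0, 1],
                   [0, 2, 1, 0]], dtype=np.float64)

terms_docs = expr.from_numpy(counts)
doc_topics, topic_term = learn_topics(terms_docs, k_topics=2, max_iter=5)

print doc_topics.glom()   # (num_docs, k_topics) topic mixture per document
print topic_term.glom()   # (k_topics, num_terms) normalized topic-term distribution
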
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS) method,
  where `A` is the "ratings" matrix mapping a (user, item) pair to a rating score, and
  `U` and `M` are the factor matrices representing user and item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the regularization parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit feedback variant of ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): max iterations to run.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]

  AT = expr.transpose(A)

  avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

  M = expr.rand(num_items, num_features)
  M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))

  for i in range(num_iter):
    # Recompute U with M fixed.
    U = expr.shuffle(expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0)),
                     _solve_U_or_M_mapper,
                     kw={'U_or_M': M, 'la': la, 'alpha': alpha,
                         'implicit_feedback': implicit_feedback},
                     shape_hint=(num_users, num_features)).optimized()

    # Recompute M with U fixed.
    M = expr.shuffle(expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0)),
                     _solve_U_or_M_mapper,
                     kw={'U_or_M': U, 'la': la, 'alpha': alpha,
                         'implicit_feedback': implicit_feedback},
                     shape_hint=(num_items, num_features)).optimized()

  return U, M

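# The body of _solve_U_or_M_mapper is not shown here; the sketch below is a
# plain-NumPy guess at the per-row work for the explicit-feedback case, using
# the usual weighted-lambda regularized least-squares update (Zhou et al.).
# solve_U_or_M_dense is illustrative only, not part of the Spartan API.
import numpy as np

def solve_U_or_M_dense(A, U_or_M, la):
  '''Solve one half of an ALS sweep on a dense ratings matrix.

  For each row of A, solve (F' F + la * n * I) x = F' r, where r holds that
  row's observed (nonzero) ratings and F the matching rows of the fixed factor.
  '''
  num_rows = A.shape[0]
  num_features = U_or_M.shape[1]
  result = np.zeros((num_rows, num_features))
  for i in range(num_rows):
    observed = np.nonzero(A[i])[0]     # columns actually rated in this row
    if len(observed) == 0:
      continue
    F = U_or_M[observed]               # (n, num_features) fixed factor rows
    r = A[i, observed]                 # (n,) observed ratings
    lhs = F.T.dot(F) + la * len(observed) * np.eye(num_features)
    result[i] = np.linalg.solve(lhs, F.T.dot(r))
  return result

# One explicit-feedback ALS iteration then looks like:
#   U = solve_U_or_M_dense(A, M, la)
#   M = solve_U_or_M_dense(A.T, U, la)
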
def __init__(self, rating_table, k=10):
  '''Based on the user-item ratings, recommend items to the user.

  Parameters
  ----------
  rating_table : Spartan array of shape (N_USERS, N_ITEMS).
      The ratings matrix: entry (i, j) is the rating of user i on item j.

  k : integer.
      The number of most similar items to precompute for each item.
      It must be less than or equal to the number of items.
  '''
  assert rating_table.shape[1] >= k, \
      "The number of items must be greater than or equal to k!"

  self.rating_table = expr.retile(rating_table,
                                  tile_hint=util.calc_tile_hint(rating_table, axis=1))
  self.k = k

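# A small usage sketch for the constructor above; ItemBasedRecommender is a
# hypothetical name for the enclosing class, and the ratings matrix is made up
# (0 meaning "not rated").
import numpy as np
from spartan import expr

ratings = np.array([[5, 3, 0, 1],
                    [4, 0, 0, 1],
                    [1, 1, 0, 5],
                    [0, 0, 5, 4],
                    [0, 1, 5, 4]], dtype=np.float64)

rating_table = expr.from_numpy(ratings)
recommender = ItemBasedRecommender(rating_table, k=2)  # k most similar items per item
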
def fit(data, labels, T=50, la=1.0):
  '''
  Train an SVM model using the DisDCA (2013) algorithm.

  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = expr.zeros((data.shape[1], 1), dtype=np.float64)
  alpha = expr.zeros((data.shape[0], 1), dtype=np.float64)

  for i in range(T):
    alpha = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                         _svm_mapper,
                         kw={'labels': labels, 'alpha': alpha, 'w': w,
                             'lambda_n': la * data.shape[0]},
                         shape_hint=alpha.shape,
                         cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)},
                                    hash(alpha): {'00': 0, '01': np.prod(alpha.shape)}})
    w = expr.sum(data * alpha * 1.0 / la / data.shape[0], axis=0).reshape((data.shape[1], 1))
    w = w.optimized()

  return w

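# _svm_mapper is not shown here; the sketch below is a plain-NumPy guess at the
# kind of local update it performs: one pass of dual coordinate ascent for a
# hinge-loss SVM (Shalev-Shwartz & Zhang, 2013). sdca_epoch and its argument
# layout (1-D w and alpha, labels in {-1, +1}) are illustrative only.
import numpy as np

def sdca_epoch(data, labels, alpha, w, lambda_n):
  '''One epoch of stochastic dual coordinate ascent; lambda_n = la * n.

  The primal vector is maintained as w = (1 / lambda_n) * sum_i alpha_i * x_i,
  matching the recomputation of w at the end of each iteration above.
  '''
  n = data.shape[0]
  for i in np.random.permutation(n):
    x_i, y_i = data[i], labels[i]
    norm_sq = x_i.dot(x_i)
    if norm_sq == 0:
      continue
    # Closed-form hinge-loss update, clipped so that 0 <= alpha_i * y_i <= 1.
    step = (1.0 - y_i * x_i.dot(w)) * lambda_n / norm_sq
    delta = y_i * max(0.0, min(1.0, step + alpha[i] * y_i)) - alpha[i]
    alpha[i] += delta
    w += (delta / lambda_n) * x_i
  return alpha, w
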