def train(self, train_data_rv):
    """Create a new ``Lda`` model for the training data and fit it.

    The model is stored on ``self.lda``. Before fitting, its ``gibbs_iter``,
    ``hyperparams_iter`` and ``hyper_parameter_number_of_samples_to_average``
    attributes are set from this object's ``train_gibbsiter``,
    ``train_hyperparamsiter`` and ``nb_averages``.

    train_data_rv: training data, handed unchanged to the ``Lda`` constructor
        together with ``self.k`` and ``self.vocab_size``.
    """
    model = Lda(train_data_rv, self.k, self.vocab_size)
    self.lda = model

    gibbs_iterations = self.train_gibbsiter
    hyper_iterations = self.train_hyperparamsiter

    model.gibbs_iter = gibbs_iterations
    model.hyperparams_iter = hyper_iterations
    model.hyper_parameter_number_of_samples_to_average = self.nb_averages

    # fit() takes the hyper-parameter iteration count first, then the Gibbs one.
    model.fit(hyper_iterations, gibbs_iterations)
def update_lda(self, cluster, data, data_lens, n_cat, n_em_itr, labels=None):
    """Return the LDA model for ``(cluster.n_states, n_cat)``, building it if needed.

    ``self.lda_mdl(n_states, n_cat)`` is consulted first; a non-``None``
    result is returned as is. Otherwise a new ``Lda`` is created, updated for
    ``n_em_itr`` iterations on the per-sequence state assignments, stored in
    ``self._lda_dic`` under ``(n_states, n_cat)`` and returned.

    cluster: object exposing ``n_states`` and ``expt_s``.
    data: not used by this method.
    data_lens: sequence lengths; its ``len`` is passed to
        ``set_default_params`` and it drives the slicing of the state vector.
    n_cat: second argument of the ``Lda`` constructor and half of the cache key.
    n_em_itr: iteration count forwarded to ``Lda.update``.
    labels: not used by this method.
    """
    n_states = cluster.n_states

    existing = self.lda_mdl(n_states, n_cat)
    if existing is not None:
        return existing

    # Hard assignment: argmax of expt_s along axis 0, then one slice per
    # (begin, end) pair produced by _data_len_ibgn_iend_itr.
    hard_states = cluster.expt_s.argmax(0)
    per_sequence_states = []
    for begin, end in _data_len_ibgn_iend_itr(data_lens):
        per_sequence_states.append(hard_states[begin:end])

    model = Lda(n_states, n_cat)
    model.set_default_params(len(data_lens))
    model.init_expt_z(data_lens)
    model.update(per_sequence_states, n_em_itr)

    self._lda_dic[(n_states, n_cat)] = model
    return model
if __name__ == '__main__':
    # Run configuration.
    use_cache = False    # forwarded as ``from_cache`` to the fetch and prepare steps
    update_html = True   # when true, the exported page is uploaded at the end

    # Fetch the articles published after the cut-off date and prepare the texts.
    newsapi = NewsboxApi()
    today = datetime.date.today()
    lookback = datetime.timedelta(days=PUBLISHED_BEFORE_N_DAYS)
    published_since_n_days = (today - lookback).isoformat()
    articles = newsapi.list_articles(
        language=LANGUAGE,
        published_after=published_since_n_days,
        from_cache=use_cache,
    )
    texts = prepare_articles(articles=articles, from_cache=use_cache)

    # Train the topic model, persist it and export the HTML page.
    lda = Lda()
    lda.train_lda(texts=texts, num_topics=NUM_TOPIC)
    lda.persist_lda()
    lda.export_html()
    # lda.visualize() is intentionally left disabled.

    # Publish the exported page for newsmap.
    if update_html:
        minioapi = MinioApi()
        bucket_name = 'newsmap'
        minioapi.create_bucket(bucket_name=bucket_name)
        minioapi.upload_file(
            bucket_name=bucket_name,
            filename='index.html',
            file='artifacts/lda/index.html',
        )
        minioapi.make_public(bucket_name=bucket_name)
def __init__(self, trdata_path, vadata_path, tedata_path, trtopic_path,
             vatopic_path, tetopic_path, beta_path, word_map_path):
    """Load the review corpus and initialise every model parameter.

    Args:
        trdata_path, vadata_path, tedata_path: review data files handed to
            ``Corpus`` (train / validation / test, going by the names).
        trtopic_path, vatopic_path, tetopic_path: text files read line by
            line; line ``i`` holds space-separated numbers that become
            column ``i`` of ``phi1_review`` / ``phi2_review`` /
            ``phi3_review`` after normalisation to sum 1.
        beta_path: text file read line by line; line ``i`` becomes column
            ``i`` of ``beta_kw`` after re-indexing through the word map.
        word_map_path: text file whose first line is skipped and whose
            remaining lines read ``<word> <index>``.

    The four topic/beta/word-map files are only read when
    ``self.init_topic == 0`` (the value hard-coded below). The process exits
    with status 1 on a word missing from the corpus vocabulary or on an
    unknown ``init_topic`` value.
    """

    def _fill_normalised_columns(path, target, reorder=None):
        # Fill target[:, i] from line i of `path`, then scale the column to sum 1.
        # The file is closed on exit, including when parsing raises.
        with open(path) as handle:
            for col, line in enumerate(handle):
                # The character left after stripping line-end characters is
                # dropped before splitting (presumably a trailing separator).
                fields = line.strip("\r\t\n")[:-1].split(" ")
                values = [float(field) for field in fields]
                if reorder is not None:
                    values = np.array(values)[reorder]
                target[:, col] = values
                target[:, col] /= np.sum(target[:, col])

    ## Parameters settings
    self.train_iters = 100
    self.infer_iters = 10
    self.grad_iters = 100
    self.K = 40              # first dimension of every theta/phi matrix below
    self.max_words = 10000   # passed to Corpus
    self.l_u = 0.1
    self.l_i = 0.1
    self.init_topic = 0      # 0 = initialise from the files, 1 = train with Lda

    ## Reading review data
    self.trdata_path = trdata_path
    self.vadata_path = vadata_path
    self.tedata_path = tedata_path
    self.corp = Corpus(trdata_path, vadata_path, tedata_path, self.max_words)
    self.n_users = self.corp.n_users
    self.n_items = self.corp.n_items
    self.n_words = self.corp.n_words
    # Single-argument parenthesised print: same output under Python 2.
    print('N_users=%d, N_items=%d, N_words=%d, N_trreviews=%d' % (
        self.n_users, self.n_items, self.n_words, len(self.corp.train_votes)))

    ## Model parameter allocation
    self.theta_user = np.random.random((self.K, self.n_users))
    self.theta_item = np.random.random((self.K, self.n_items))
    self.phi1_review = np.zeros((self.K, len(self.corp.train_votes)))
    self.phi2_review = np.zeros((self.K, len(self.corp.vali_votes)))
    self.phi3_review = np.zeros((self.K, len(self.corp.test_votes)))
    self.beta_kw = np.zeros((self.n_words, self.K))
    self.log_beta_kw = np.zeros((self.n_words, self.K))

    if self.init_topic == 0:
        _fill_normalised_columns(trtopic_path, self.phi1_review)
        _fill_normalised_columns(vatopic_path, self.phi2_review)
        _fill_normalised_columns(tetopic_path, self.phi3_review)

        # id_map_id[index in the word-map file] -> index in the corpus vocabulary.
        # NOTE(review): slots never assigned keep -1, which numpy indexing
        # silently treats as "last element" -- confirm the map is complete.
        id_map_id = [-1] * self.n_words
        with open(word_map_path) as word_map_file:
            for line_no, line in enumerate(word_map_file):
                if line_no == 0:
                    continue  # first line is skipped (presumably a header)
                parts = line.strip("\r\t\n").split(" ")
                word = parts[0]
                r_widx = int(parts[1])
                if word not in self.corp.word_ids:
                    print('Word mismatch')
                    sys.exit(1)
                id_map_id[r_widx] = self.corp.word_ids[word]

        _fill_normalised_columns(beta_path, self.beta_kw, reorder=id_map_id)
        self.log_beta_kw = np.log(self.beta_kw)
    elif self.init_topic == 1:
        lda = Lda(self.K)
        (self.phi1_review, self.beta_kw) = lda.train(self.corp.train_votes,
                                                     self.n_words)
        self.log_beta_kw = np.log(self.beta_kw)
        self.phi2_review = lda.inference(self.corp.vali_votes)
        self.phi3_review = lda.inference(self.corp.test_votes)
    else:
        print('Invalid choice topic init method')
        sys.exit(1)

    # Positions (into corp.train_votes) of the votes of each user / item.
    self.train_votes_puser = [[] for _ in range(self.n_users)]
    self.train_votes_pitem = [[] for _ in range(self.n_items)]
    for position, vote in enumerate(self.corp.train_votes):
        self.train_votes_puser[vote.user].append(position)
        self.train_votes_pitem[vote.item].append(position)

    self.probValCheck()
    print("Finished model preprocessing")
class LDA_Value_Classifier(RVClassifier):
    """Classifier that scores candidate values with a fitted ``Lda`` model.

    ``train`` fits the model; the two classification methods compute, for
    every test record, ``theta . phi`` restricted to the requested value ids
    and map the best-scoring value to the index of the group that contains it.
    """

    def __init__(self, k, vocab_size):
        """Store the iteration settings; no model exists until ``train`` runs.

        k: number of hidden topics; ``None`` selects the default of 20.
        vocab_size: forwarded to ``RVClassifier.__init__``.
        """
        RVClassifier.__init__(self, vocab_size)
        self.k = 20 if k is None else k
        # Settings used by train().
        self.train_gibbsiter = 40
        self.nb_averages = 3
        self.train_hyperparamsiter = 25
        # Settings passed to Lda.fit_newrecords() at prediction time.
        self.test_hyperparamsiter = 20
        self.test_gibbsiter = 1
        self.test_chainnb = 5
        self.lda = None

    def train(self, train_data_rv):
        """Fit a new ``Lda`` model on ``train_data_rv`` and keep it in ``self.lda``.

        train_data_rv: training data, passed unchanged to the ``Lda``
            constructor together with ``self.k`` and ``self.vocab_size``.
        """
        self.lda = Lda(train_data_rv, self.k, self.vocab_size)
        self.lda.gibbs_iter = self.train_gibbsiter
        self.lda.hyperparams_iter = self.train_hyperparamsiter
        self.lda.hyper_parameter_number_of_samples_to_average = self.nb_averages
        self.lda.fit(self.train_hyperparamsiter, self.train_gibbsiter)

    def _compute_prediction_confidences(self, test_data_rv,
                                        ids_realizations_to_classify, v_groups):
        """Return the confidence of every candidate value for every test record.

        test_data_rv: test data, passed to ``Lda.fit_newrecords``.
        ids_realizations_to_classify: column ids selected from ``lda.phi``.
        v_groups: accepted for signature symmetry; not used here.

        Returns ``np.dot(test_theta, phi)``: one column per requested id and
        one row per row of ``test_theta`` (one per test record, according to
        the original notes).

        Raises ``Exception`` when the classifier has not been trained.
        """
        if self.lda is None:
            raise Exception(
                "NON TRAINED CLASSIFIER EXCEPTION: the classifier needs to be trained before if is able to make classifications"
            )
        # Fit the new records to the trained model; the perplexity is not needed.
        test_theta, _perplexity = self.lda.fit_newrecords(
            test_data_rv, self.test_hyperparamsiter, self.test_gibbsiter,
            self.test_chainnb)
        phi = self.lda.phi[:, ids_realizations_to_classify]
        # Pr(v|r) = sum_k Pr(k|r) Pr(v|k) = theta[r, :] . phi[:, v]
        return np.dot(test_theta, phi)

    def compute_perplexity(self, test_data_rfv):
        """Not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented exception")

    @staticmethod
    def _values_to_classes(winner_ids, v_groups):
        """Map each winning value id to the index of the first group containing it.

        NOTE(review): an id found in no group contributes nothing, so the
        result can be shorter than ``winner_ids``.
        """
        winner_classes = []
        for value in winner_ids:
            for class_id, members in enumerate(v_groups):
                if value in members:
                    winner_classes.append(class_id)
                    break
        return winner_classes

    def classify_with_no_regroup(self, test_data_rfv,
                                 ids_realizations_to_classify, v_groups):
        """Classify every test record independently.

        test_data_rfv: test data.
        ids_realizations_to_classify: ids of the candidate values.
        v_groups: list of lists of value ids; all ids in list ``i`` count as
            class ``i``.

        Returns ``[classes, scores]``: the class index of the best-scoring
        value per record, and the array of the corresponding best scores.
        """
        predictions = self._compute_prediction_confidences(
            test_data_rfv, ids_realizations_to_classify, v_groups)
        winner_ranks = np.argmax(predictions, 1)
        winner_ids = np.asarray(
            [ids_realizations_to_classify[rank] for rank in winner_ranks])
        winner_scores = np.max(predictions, 1)
        winner_classes = self._values_to_classes(winner_ids, v_groups)
        return [winner_classes, winner_scores]

    def regroup_and_classify(self, test_data_rfv, ids_realizations_to_classify,
                             r_groups, v_groups):
        """Classify groups of records, one shared decision per group.

        test_data_rfv: test data.
        ids_realizations_to_classify: ids of the candidate values.
        r_groups: list of lists of record indices; each inner list is
            classified as a unit.
        v_groups: list of lists of value ids; all ids in list ``i`` count as
            class ``i``.

        Returns ``[classes, scores]``: per group, the class index of the
        winning value and the list of winning scores.
        """
        predictions = self._compute_prediction_confidences(
            test_data_rfv, ids_realizations_to_classify, v_groups)
        winner_ranks = []
        winner_scores = []
        for group in r_groups:
            # Combine the group's records by multiplying their confidences
            # column-wise, then keep the best-scoring value.
            group_record = np.prod(predictions[group, :], 0)
            winner_scores.append(np.amax(group_record))
            winner_ranks.append(np.argmax(group_record))
        winner_ids = np.asarray(
            [ids_realizations_to_classify[rank] for rank in winner_ranks])
        winner_classes = self._values_to_classes(winner_ids, v_groups)
        return [winner_classes, winner_scores]
def train(self, train_data_rv):
    """Fit a new ``Lda`` model on the training data and keep it in ``self.lda``.

    The iteration counts (``train_gibbsiter``, ``train_hyperparamsiter``) and
    the number of samples to average (``nb_averages``) are copied onto the
    model before ``fit`` is called.

    train_data_rv: training data, handed unchanged to the ``Lda`` constructor
        together with ``self.k`` and ``self.vocab_size``.
    """
    fitted = Lda(train_data_rv, self.k, self.vocab_size)
    self.lda = fitted

    n_gibbs = self.train_gibbsiter
    n_hyper = self.train_hyperparamsiter

    fitted.gibbs_iter = n_gibbs
    fitted.hyperparams_iter = n_hyper
    fitted.hyper_parameter_number_of_samples_to_average = self.nb_averages

    # Argument order of fit(): hyper-parameter iterations, then Gibbs iterations.
    fitted.fit(n_hyper, n_gibbs)
class LDA_Value_Classifier(RVClassifier):
    """Classifier that scores candidate values with a fitted ``Lda`` model.

    ``train`` fits the model; the two classification methods compute, for
    every test record, ``theta . phi`` restricted to the requested value ids
    and map the best-scoring value to the index of the group that contains it.
    """

    def __init__(self, k, vocab_size):
        """Store the iteration settings; no model exists until ``train`` runs.

        k: number of hidden topics; ``None`` selects the default of 20.
        vocab_size: forwarded to ``RVClassifier.__init__``.
        """
        RVClassifier.__init__(self, vocab_size)
        self.k = 20 if k is None else k
        # Settings used by train().
        self.train_gibbsiter = 40
        self.nb_averages = 3
        self.train_hyperparamsiter = 25
        # Settings passed to Lda.fit_newrecords() at prediction time.
        self.test_hyperparamsiter = 20
        self.test_gibbsiter = 1
        self.test_chainnb = 5
        self.lda = None

    def train(self, train_data_rv):
        """Fit a new ``Lda`` model on ``train_data_rv`` and keep it in ``self.lda``.

        train_data_rv: training data, passed unchanged to the ``Lda``
            constructor together with ``self.k`` and ``self.vocab_size``.
        """
        self.lda = Lda(train_data_rv, self.k, self.vocab_size)
        self.lda.gibbs_iter = self.train_gibbsiter
        self.lda.hyperparams_iter = self.train_hyperparamsiter
        self.lda.hyper_parameter_number_of_samples_to_average = self.nb_averages
        self.lda.fit(self.train_hyperparamsiter, self.train_gibbsiter)

    def _compute_prediction_confidences(self, test_data_rv,
                                        ids_realizations_to_classify, v_groups):
        """Return the confidence of every candidate value for every test record.

        test_data_rv: test data, passed to ``Lda.fit_newrecords``.
        ids_realizations_to_classify: column ids selected from ``lda.phi``.
        v_groups: accepted for signature symmetry; not used here.

        Returns ``np.dot(test_theta, phi)``: one column per requested id and
        one row per row of ``test_theta`` (one per test record, according to
        the original notes).

        Raises ``Exception`` when the classifier has not been trained.
        """
        if self.lda is None:
            raise Exception("NON TRAINED CLASSIFIER EXCEPTION: the classifier needs to be trained before if is able to make classifications")
        # Fit the new records to the trained model; the perplexity is not needed.
        test_theta, _perplexity = self.lda.fit_newrecords(
            test_data_rv, self.test_hyperparamsiter, self.test_gibbsiter,
            self.test_chainnb)
        phi = self.lda.phi[:, ids_realizations_to_classify]
        # Pr(v|r) = sum_k Pr(k|r) Pr(v|k) = theta[r, :] . phi[:, v]
        return np.dot(test_theta, phi)

    def compute_perplexity(self, test_data_rfv):
        """Not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not implemented exception")

    @staticmethod
    def _values_to_classes(winner_ids, v_groups):
        """Map each winning value id to the index of the first group containing it.

        NOTE(review): an id found in no group contributes nothing, so the
        result can be shorter than ``winner_ids``.
        """
        winner_classes = []
        for value in winner_ids:
            for class_id, members in enumerate(v_groups):
                if value in members:
                    winner_classes.append(class_id)
                    break
        return winner_classes

    def classify_with_no_regroup(self, test_data_rfv,
                                 ids_realizations_to_classify, v_groups):
        """Classify every test record independently.

        test_data_rfv: test data.
        ids_realizations_to_classify: ids of the candidate values.
        v_groups: list of lists of value ids; all ids in list ``i`` count as
            class ``i``.

        Returns ``[classes, scores]``: the class index of the best-scoring
        value per record, and the array of the corresponding best scores.
        """
        predictions = self._compute_prediction_confidences(
            test_data_rfv, ids_realizations_to_classify, v_groups)
        winner_ranks = np.argmax(predictions, 1)
        winner_ids = np.asarray(
            [ids_realizations_to_classify[rank] for rank in winner_ranks])
        winner_scores = np.max(predictions, 1)
        winner_classes = self._values_to_classes(winner_ids, v_groups)
        return [winner_classes, winner_scores]

    def regroup_and_classify(self, test_data_rfv, ids_realizations_to_classify,
                             r_groups, v_groups):
        """Classify groups of records, one shared decision per group.

        test_data_rfv: test data.
        ids_realizations_to_classify: ids of the candidate values.
        r_groups: list of lists of record indices; each inner list is
            classified as a unit.
        v_groups: list of lists of value ids; all ids in list ``i`` count as
            class ``i``.

        Returns ``[classes, scores]``: per group, the class index of the
        winning value and the list of winning scores.
        """
        predictions = self._compute_prediction_confidences(
            test_data_rfv, ids_realizations_to_classify, v_groups)
        winner_ranks = []
        winner_scores = []
        for group in r_groups:
            # Combine the group's records by multiplying their confidences
            # column-wise, then keep the best-scoring value.
            group_record = np.prod(predictions[group, :], 0)
            winner_scores.append(np.amax(group_record))
            winner_ranks.append(np.argmax(group_record))
        winner_ids = np.asarray(
            [ids_realizations_to_classify[rank] for rank in winner_ranks])
        winner_classes = self._values_to_classes(winner_ids, v_groups)
        return [winner_classes, winner_scores]