def train(self, train_data_rv):
    """Build and fit an LDA model on train_data_rv, storing it on self.lda.

    Sampler iteration counts and the hyper-parameter averaging window are
    copied from the corresponding attributes on self before fitting.
    """
    model = Lda(train_data_rv, self.k, self.vocab_size)
    # Configure the sampler before running fit.
    model.gibbs_iter = self.train_gibbsiter
    model.hyperparams_iter = self.train_hyperparamsiter
    model.hyper_parameter_number_of_samples_to_average = self.nb_averages
    self.lda = model
    model.fit(self.train_hyperparamsiter, self.train_gibbsiter)
def update_lda(self, cluster, data, data_lens, n_cat, n_em_itr, labels=None):
    """Return the LDA model for (cluster.n_states, n_cat).

    On a cache miss a new model is trained on the cluster's MAP state
    sequences and memoized in self._lda_dic before being returned.
    """
    n_states = cluster.n_states
    model = self.lda_mdl(n_states, n_cat)
    if model is not None:
        # Cache hit: reuse the previously trained model as-is.
        return model
    # Cache miss: derive per-batch state sequences from the expected
    # state assignments, then train a fresh model on them.
    states = cluster.expt_s.argmax(0)
    s_list = [states[b:e] for b, e in _data_len_ibgn_iend_itr(data_lens)]
    model = Lda(n_states, n_cat)
    model.set_default_params(len(data_lens))
    model.init_expt_z(data_lens)
    model.update(s_list, n_em_itr)
    self._lda_dic[(n_states, n_cat)] = model
    return model
if __name__ == '__main__':
    # Pipeline settings.
    from_cache = False
    publish_html = True

    # Fetch recent articles and turn them into training texts.
    api = NewsboxApi()
    cutoff_date = (datetime.date.today()
                   - datetime.timedelta(days=PUBLISHED_BEFORE_N_DAYS)).isoformat()
    fetched = api.list_articles(language=LANGUAGE,
                                published_after=cutoff_date,
                                from_cache=from_cache)
    texts = prepare_articles(articles=fetched, from_cache=from_cache)

    # Train the LDA model, persist it, and export the HTML report.
    model = Lda()
    model.train_lda(texts=texts, num_topics=NUM_TOPIC)
    model.persist_lda()
    model.export_html()

    # Publish the rendered HTML so the newsmap frontend can serve it.
    if publish_html:
        storage = MinioApi()
        bucket = 'newsmap'
        storage.create_bucket(bucket_name=bucket)
        storage.upload_file(bucket_name=bucket,
                            filename='index.html',
                            file='artifacts/lda/index.html')
        storage.make_public(bucket_name=bucket)
def __init__(self, trdata_path, vadata_path, tedata_path, trtopic_path,
             vatopic_path, tetopic_path, beta_path, word_map_path):
    """Load the review corpus and initialize model parameters.

    Parameters are file paths: train/validation/test review data, the
    matching per-review topic-proportion files, the topic-word (beta)
    matrix file, and the external-index -> word map file.

    Fixes vs. the previous version: files are opened via `with` so they
    are always closed (the old code leaked five file handles), and the
    three identical topic-file loading loops are factored into
    `_load_topic_file`. Behavior is otherwise unchanged.
    """
    ## Parameter settings
    self.train_iters = 100
    self.infer_iters = 10
    self.grad_iters = 100
    self.K = 40
    self.max_words = 10000
    self.l_u = 0.1
    self.l_i = 0.1
    self.init_topic = 0  # 0=init from files, 1=init from implemented by us

    ## Reading review data
    self.trdata_path = trdata_path
    self.vadata_path = vadata_path
    self.tedata_path = tedata_path
    self.corp = Corpus(trdata_path, vadata_path, tedata_path, self.max_words)
    self.n_users = self.corp.n_users
    self.n_items = self.corp.n_items
    self.n_words = self.corp.n_words
    # Single-argument parenthesized print works under both py2 and py3.
    print('N_users=%d, N_items=%d, N_words=%d, N_trreviews=%d'
          % (self.n_users, self.n_items, self.n_words,
             len(self.corp.train_votes)))

    ## Model parameter allocation (K x entities; topics per review column)
    self.theta_user = np.random.random((self.K, self.n_users))
    self.theta_item = np.random.random((self.K, self.n_items))
    self.phi1_review = np.zeros((self.K, len(self.corp.train_votes)))
    self.phi2_review = np.zeros((self.K, len(self.corp.vali_votes)))
    self.phi3_review = np.zeros((self.K, len(self.corp.test_votes)))
    self.beta_kw = np.zeros((self.n_words, self.K))
    self.log_beta_kw = np.zeros((self.n_words, self.K))

    if self.init_topic == 0:
        # Topic proportions for each split come from precomputed files.
        self._load_topic_file(trtopic_path, self.phi1_review)
        self._load_topic_file(vatopic_path, self.phi2_review)
        self._load_topic_file(tetopic_path, self.phi3_review)
        # Map the beta file's external word indices onto corpus indices.
        id_map_id = self._load_word_map(word_map_path)
        with open(beta_path) as fp:
            for i, line in enumerate(fp):
                parts = line.strip("\r\t\n")[:-1].split(" ")
                parts = np.array(list(map(float, parts)))
                # Fancy indexing reorders the row into corpus word order.
                # NOTE(review): unmapped slots stay -1 and therefore pick
                # the last element — assumed every index is mapped; confirm.
                self.beta_kw[:, i] = parts[id_map_id]
                self.beta_kw[:, i] /= np.sum(self.beta_kw[:, i])
        self.log_beta_kw = np.log(self.beta_kw)
    elif self.init_topic == 1:
        # Train our own LDA instead of reading precomputed topic files.
        lda = Lda(self.K)
        (self.phi1_review, self.beta_kw) = lda.train(self.corp.train_votes,
                                                     self.n_words)
        self.log_beta_kw = np.log(self.beta_kw)
        self.phi2_review = lda.inference(self.corp.vali_votes)
        self.phi3_review = lda.inference(self.corp.test_votes)
    else:
        print('Invalid choice topic init method')
        sys.exit(1)

    ## Per-user / per-item lists of training-vote indices
    self.train_votes_puser = [[] for u in xrange(self.n_users)]
    self.train_votes_pitem = [[] for i in xrange(self.n_items)]
    for i, vote in enumerate(self.corp.train_votes):
        uidx = vote.user
        iidx = vote.item
        self.train_votes_puser[uidx].append(i)
        self.train_votes_pitem[iidx].append(i)
    self.probValCheck()
    print("Finished model preprocessing")

def _load_topic_file(self, path, mat):
    """Fill mat[:, i] with the L1-normalized topic vector on line i of path."""
    with open(path) as fp:
        for i, line in enumerate(fp):
            # strip trailing CR/TAB/LF, drop the trailing space, then split.
            parts = line.strip("\r\t\n")[:-1].split(" ")
            mat[:, i] = list(map(float, parts))
            mat[:, i] /= np.sum(mat[:, i])

def _load_word_map(self, word_map_path):
    """Return a list mapping external word index -> corpus word index.

    The first line of the file is a header and is skipped. Exits with an
    error if a mapped word is not present in the corpus vocabulary.
    """
    id_map_id = [-1 for _ in xrange(self.n_words)]
    with open(word_map_path) as fp:
        for i, line in enumerate(fp):
            if i == 0:
                continue  # header line
            parts = line.strip("\r\t\n").split(" ")
            word = parts[0]
            r_widx = int(parts[1])
            if word not in self.corp.word_ids:
                print('Word mismatch')
                sys.exit(1)
            id_map_id[r_widx] = self.corp.word_ids[word]
    return id_map_id