def train(self, train_data_rv):
    # Build the LDA model over the training data and configure the sampler.
    self.lda = Lda(train_data_rv, self.k, self.vocab_size)
    self.lda.gibbs_iter = self.train_gibbsiter
    self.lda.hyperparams_iter = self.train_hyperparamsiter
    self.lda.hyper_parameter_number_of_samples_to_average = self.nb_averages
    # Run the fit with the configured hyperparameter and Gibbs iteration counts.
    self.lda.fit(self.train_hyperparamsiter, self.train_gibbsiter)
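For context, a hypothetical driver showing how this method might be wired up; the owning class name and every value below are assumptions inferred from the attributes train() reads, not part of the original example:

# Hypothetical driver; TopicModel and all values below are assumptions
# inferred from the attributes train() reads, not the real API.
model = TopicModel()
model.k = 20                       # number of topics
model.vocab_size = 5000
model.train_gibbsiter = 200        # Gibbs sampling sweeps
model.train_hyperparamsiter = 50   # hyperparameter optimisation steps
model.nb_averages = 10             # samples averaged per hyperparameter update
model.train(train_data_rv)         # train_data_rv: the training corpus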
Example #2
def update_lda(self,
               cluster,
               data,
               data_lens,
               n_cat,
               n_em_itr,
               labels=None):
    n_states = cluster.n_states
    # Decode the most likely state sequence and split it per batch.
    # (In the original, s_list was only built when no cached model existed,
    # which raised a NameError on the update call below; it is needed
    # unconditionally.)
    states = cluster.expt_s.argmax(0)
    s_list = [
        states[b:e] for b, e in _data_len_ibgn_iend_itr(data_lens)
    ]
    # Reuse a cached LDA model for this (n_states, n_cat) pair, if any.
    lda = self.lda_mdl(n_states, n_cat)
    if lda is None:
        lda = Lda(n_states, n_cat)
        n_batches = len(data_lens)
        lda.set_default_params(n_batches)
        lda.init_expt_z(data_lens)
    # Run n_em_itr EM iterations on the state sequences and cache the model.
    lda.update(s_list, n_em_itr)
    self._lda_dic[(n_states, n_cat)] = lda
    return lda
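This method assumes a cache accessor lda_mdl; a minimal sketch of what it might look like, matching the cache write on the last lines (hypothetical, not the project's actual code):

def lda_mdl(self, n_states, n_cat):
    # Hypothetical accessor: return the cached model for this
    # (n_states, n_cat) pair, or None so update_lda builds a fresh one.
    return self._lda_dic.get((n_states, n_cat))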
Example #3
import datetime

if __name__ == '__main__':
    # Settings
    use_cache = False
    update_html = True

    # Retrieve and prepare dataset
    newsapi = NewsboxApi()
    # ISO date marking the start of the retrieval window.
    published_after_date = (datetime.date.today()
                            - datetime.timedelta(days=PUBLISHED_BEFORE_N_DAYS)).isoformat()
    articles = newsapi.list_articles(language=LANGUAGE,
                                     published_after=published_after_date,
                                     from_cache=use_cache)
    texts = prepare_articles(articles=articles, from_cache=use_cache)

    # Train LDA
    lda = Lda()
    lda.train_lda(texts=texts, num_topics=NUM_TOPIC)
    lda.persist_lda()
    lda.export_html()
    # lda.visualize()

    # Upload the LDA html for the newsmap
    if update_html:
        minioapi = MinioApi()
        bucket_name = 'newsmap'
        minioapi.create_bucket(bucket_name=bucket_name)
        minioapi.upload_file(bucket_name=bucket_name,
                             filename='index.html',
                             file='artifacts/lda/index.html')
        minioapi.make_public(bucket_name=bucket_name)
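The Lda class here is project-specific and not shown; a minimal sketch of what its train_lda step might amount to using gensim (the gensim-based implementation is an assumption, not this project's actual code):

# Hypothetical gensim equivalent of Lda.train_lda; the project's real
# wrapper may differ.
from gensim import corpora, models

def train_lda_sketch(texts, num_topics):
    # texts: a list of token lists, as produced by prepare_articles
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(doc) for doc in texts]
    return models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)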
Example #4
    # Module-level requirements: sys, numpy as np, and the project's Corpus and Lda classes.
    def __init__(self, trdata_path, vadata_path, tedata_path, trtopic_path,
                 vatopic_path, tetopic_path, beta_path, word_map_path):
        ## Parameter settings
        self.train_iters = 100
        self.infer_iters = 10
        self.grad_iters = 100
        self.K = 40
        self.max_words = 10000
        self.l_u = 0.1
        self.l_i = 0.1
        self.init_topic = 0        # 0 = init topics from files, 1 = init with our own LDA implementation

        ## Reading review data
        self.trdata_path = trdata_path
        self.vadata_path = vadata_path
        self.tedata_path = tedata_path
        self.corp = Corpus(trdata_path, vadata_path, tedata_path, self.max_words)
        self.n_users = self.corp.n_users
        self.n_items = self.corp.n_items
        self.n_words = self.corp.n_words
        print('N_users=%d, N_items=%d, N_words=%d, N_trreviews=%d' % (self.n_users, self.n_items, self.n_words, len(self.corp.train_votes)))

        ## Model parameter allocation
        self.theta_user = np.random.random((self.K, self.n_users))
        self.theta_item = np.random.random((self.K, self.n_items))
        self.phi1_review = np.zeros((self.K, len(self.corp.train_votes)))
        self.phi2_review = np.zeros((self.K, len(self.corp.vali_votes)))
        self.phi3_review = np.zeros((self.K, len(self.corp.test_votes)))
        self.beta_kw = np.zeros((self.n_words, self.K))
        self.log_beta_kw = np.zeros((self.n_words, self.K))
        if self.init_topic == 0:
            # Load per-review topic distributions produced by an external LDA
            # run, normalising each review's distribution to sum to one.
            def load_phi(path, phi):
                for i, line in enumerate(open(path)):
                    parts = line.strip("\r\t\n")[:-1].split(" ")
                    phi[:, i] = [float(p) for p in parts]
                    phi[:, i] /= np.sum(phi[:, i])
            load_phi(trtopic_path, self.phi1_review)
            load_phi(vatopic_path, self.phi2_review)
            load_phi(tetopic_path, self.phi3_review)
            # Map the external tool's word ids onto this corpus' word ids.
            id_map_id = [-1 for i in range(self.n_words)]
            for i, line in enumerate(open(word_map_path)):
                if i == 0:
                    continue          # skip header line
                parts = line.strip("\r\t\n").split(" ")
                word = parts[0]
                r_widx = int(parts[1])
                if word not in self.corp.word_ids:
                    print('Word mismatch')
                    sys.exit(1)
                widx = self.corp.word_ids[word]
                id_map_id[r_widx] = widx
            # Load topic-word distributions, reordering rows to corpus word ids
            # and normalising each topic to sum to one.
            for i, line in enumerate(open(beta_path)):
                parts = line.strip("\r\t\n")[:-1].split(" ")
                parts = np.array([float(p) for p in parts])
                self.beta_kw[:, i] = parts[id_map_id]
                self.beta_kw[:, i] /= np.sum(self.beta_kw[:, i])
            self.log_beta_kw = np.log(self.beta_kw)
        elif self.init_topic == 1:
            # Train our own LDA on the training reviews and infer topic
            # distributions for the validation and test reviews.
            lda = Lda(self.K)
            (self.phi1_review, self.beta_kw) = lda.train(self.corp.train_votes, self.n_words)
            self.log_beta_kw = np.log(self.beta_kw)
            self.phi2_review = lda.inference(self.corp.vali_votes)
            self.phi3_review = lda.inference(self.corp.test_votes)
        else:
            print('Invalid topic init method')
            sys.exit(1)

        # Index training votes by user and by item for fast per-entity lookup.
        self.train_votes_puser = [[] for u in range(self.n_users)]
        self.train_votes_pitem = [[] for i in range(self.n_items)]
        for i, vote in enumerate(self.corp.train_votes):
            uidx = vote.user
            iidx = vote.item
            self.train_votes_puser[uidx].append(i)
            self.train_votes_pitem[iidx].append(i)
        self.probValCheck()
        print("Finished model preprocessing")