def tp_one_trial(dataset, model_type, topic_size, sample_size, min_cf=3, rm_top=5,
                 max_iter=1000, min_iter=None, checkpoint=None, stop_increase=1, metric='ll'):
    """Train one tomotopy topic model with metric-based early stopping.

    The model selected by ``model_type`` is built, the first ``sample_size``
    items of ``dataset`` are added to it, ``min_iter`` warm-up iterations are
    run, and then training continues one iteration at a time for at most
    ``max_iter`` further iterations. Every ``checkpoint`` iterations the
    metric averaged over that window is compared with the best window average
    seen so far; a window that is worse counts as one strike, and training
    stops once ``stop_increase`` strikes have accumulated.

    Args:
        dataset: Indexable collection supporting ``len()``; ``dataset[i]``
            must unpack into ``(doc, label)``. ``doc`` is passed to
            ``model.add_doc``; ``label`` is only used for ``'slda'``, where
            it is converted with ``float()``.
        model_type: One of ``'lda'``, ``'ctm'``, ``'slda'``, ``'hdp'``.
        topic_size: Passed as ``k`` (or ``initial_k`` for ``'hdp'``).
        sample_size: Maximum number of documents to add; capped at
            ``len(dataset)``.
        min_cf: Forwarded to the model constructor.
        rm_top: Forwarded to the model constructor.
        max_iter: Upper bound on the number of monitored iterations. Also
            sets ``model.burn_in`` to ``max_iter // 5``.
        min_iter: Warm-up iterations run before monitoring starts. Defaults
            to ``max_iter // 5``.
        checkpoint: Window length, in iterations, between metric
            comparisons. Defaults to ``max_iter // 5``, but never below 1.
        stop_increase: Number of non-improving windows tolerated before
            stopping.
        metric: ``'ll'`` to monitor ``model.ll_per_word`` or ``'pp'`` to
            monitor the negated ``model.perplexity``.

    Returns:
        A ``(model, final_metric)`` tuple. ``final_metric`` is the model's
        current ``perplexity`` when ``metric == 'pp'`` and its current
        ``ll_per_word`` otherwise (not the windowed average).

    Raises:
        AssertionError: If ``model_type`` or ``metric`` is not one of the
            accepted values.
    """
    assert model_type in ['lda', 'ctm', 'slda', 'hdp'], f'invalid `model_type`: {model_type}...'
    assert metric in ['ll', 'pp'], f'invalid `metric`: {metric}...'

    if model_type == 'lda':
        model = tp.LDAModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'ctm':
        model = tp.CTModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'slda':
        model = tp.SLDAModel(k=topic_size, vars="b", tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'hdp':
        model = tp.HDPModel(initial_k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)

    sample_size = min(sample_size, len(dataset))

    # max_iter = max_iter * sample_size * topic_size // 2000  # ensure the number of iterations increases with the size of sample
    model.burn_in = max_iter // 5  # set burn-in: 20 percent of max iterations

    is_supervised = model_type == 'slda'
    for doc_idx in range(sample_size):
        doc, label = dataset[doc_idx]
        if is_supervised:
            model.add_doc(doc, [float(label)])
        else:
            model.add_doc(doc)

    if min_iter is None:
        min_iter = max_iter // 5
    if checkpoint is None:
        # Clamp to 1: with max_iter < 5 the integer division gives 0, which
        # would make the modulo test below raise ZeroDivisionError.
        checkpoint = max(1, max_iter // 5)

    model.train(min_iter)

    # float("-inf") instead of np.infty: that alias was removed in NumPy 2.0.
    best_avg = float("-inf")
    strikes = 0
    window_sum = 0.0
    # Dedicated counter so the summary below is correct even when the loop
    # body never runs (max_iter <= 0).
    iterations_run = 0
    for iterations_run in range(1, max_iter + 1):
        model.train(1)
        # The monitored value is oriented so that larger is always better.
        if metric == 'll':
            window_sum += model.ll_per_word
        elif metric == 'pp':
            window_sum += -model.perplexity  # smaller perplexity is better.

        if iterations_run % checkpoint == 0:
            window_avg = window_sum / checkpoint
            print(f'Current loss: {window_avg:.5f}')
            if window_avg >= best_avg:
                best_avg = window_avg
            else:
                strikes += 1
            window_sum = 0.0

        if strikes >= stop_increase:
            break

    final_metric = model.perplexity if metric == 'pp' else model.ll_per_word
    print(f'Trial iterations: {iterations_run + min_iter}.')
    return model, final_metric
# Load the preprocessed 20 Newsgroups corpus from disk if a cached copy
# exists; otherwise build it from scratch and cache it for the next run.
try:
    corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
except IOError:
    porter_stemmer = nltk.PorterStemmer().stem
    # Stop words are stemmed too, so they match the stemmed tokens.
    english_stops = {porter_stemmer(w) for w in stopwords.words('english')}
    # Keep only tokens made of two or more lowercase ASCII letters.
    pat = re.compile('^[a-z]{2,}$')
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
        stopwords=lambda x: x in english_stops or not pat.match(x))
    newsgroups_train = fetch_20newsgroups()
    corpus.process(d.lower() for d in newsgroups_train.data)
    # save preprocessed corpus for reuse
    corpus.save('preprocessed_20news.cps')

mdl = tp.CTModel(tw=tp.TermWeight.IDF, min_df=5, rm_top=40, k=30, corpus=corpus)
# Zero-iteration train call before reading the model statistics below.
# NOTE(review): presumably this finalises the vocabulary so the counts are
# populated - confirm against the tomotopy documentation.
mdl.train(0)

# Since we have more than ten thousand documents, setting `num_beta_sample`
# to a smaller value will not cause an inaccurate result.
mdl.num_beta_sample = 5
print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words))
print('Removed Top words: ', *mdl.removed_top_words)

# Let's train the model: 1000 iterations in steps of 20, reporting the
# log-likelihood per word before each step.
for i in range(0, 1000, 20):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
import tomotopy as tp

# Build a two-topic correlated topic model and print two of its sampling
# settings as they stand right after construction.
model = tp.CTModel(k=2)

# Further inspection calls, currently disabled:
# print(model.get_correlations())
# print(model.get_correlations(0))
# print(model.prior_cov)
# print(model.prior_mean)
for setting_name in ("num_beta_sample", "num_tmn_sample"):
    print(getattr(model, setting_name))
# array = np.array(t5)
# print(array)

# --- gensim LDA over the whitespace-tokenised documents in `t5` ---
dataset = [d.split() for d in t5]
dictionary = corpora.Dictionary(dataset)
print(dictionary)
# Bag-of-words representation of every document.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in dataset]
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)
print(ldamodel.print_topics(num_topics=3, num_words=3))
# ********************************************************************************************
#
import tomotopy as tp

# --- tomotopy correlated topic model, one document per line of the file ---
mdl = tp.CTModel(k=20)
# Context manager so the file handle is closed once all lines are read.
with open('C://Users//bvjan//Documents//data2.txt') as corpus_file:
    for line in corpus_file:
        mdl.add_doc(line.strip().split())

for i in range(0, 100, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

# One value drives both the label and the query, so the printed heading
# always matches the number of words actually requested.
top_n = 2
for k in range(mdl.k):
    print('Top {} words of topic #{}'.format(top_n, k))
    print(mdl.get_topic_words(k, top_n=top_n))
# **********************************************************************************************
import matplotlib.pyplot as plt

labels = t5