Example #1
import numpy as np
import tomotopy as tp


def tp_one_trial(dataset, model_type, topic_size, sample_size, min_cf=3, rm_top=5,
                 max_iter=1000, min_iter=None, checkpoint=None, stop_increase=1, metric='ll'):
    """Run a single topic-model trial with early stopping on the chosen metric."""
    assert model_type in ['lda', 'ctm', 'slda', 'hdp'], f'invalid `model_type`: {model_type}...'
    assert metric in ['ll', 'pp'], f'invalid `metric`: {metric}...'
    if model_type == 'lda':
        model = tp.LDAModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'ctm':
        model = tp.CTModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'slda':
        model = tp.SLDAModel(k=topic_size, vars='b', tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'hdp':
        model = tp.HDPModel(initial_k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    sample_size = min(sample_size, len(dataset))
    
#     max_iter = max_iter * sample_size * topic_size // 2000  # ensure the number of iterations increases with the size of sample
    model.burn_in = max_iter // 5  # set burn-in: 20 percent of max iterations

    for i in range(sample_size):
        doc, label = dataset[i]
        if model_type == "slda":
            model.add_doc(doc,[float(label),])
        else:
            model.add_doc(doc)

    if min_iter is None:
        min_iter = max_iter // 5
    if checkpoint is None:
        checkpoint = max_iter // 5

    model.train(min_iter)

    pre_metric = -np.inf
    stop_increase_cnt = 0
    cur_metric = 0.
    for i in range(1, max_iter+1):
        model.train(1)
        # For both metrics, larger values are better
        if metric == 'll':
            cur_metric += model.ll_per_word
        elif metric == 'pp':
            cur_metric -= model.perplexity  # smaller perplexity is better, so negate it

        if i % checkpoint == 0:
            cur_metric /= checkpoint
            print(f'Current metric: {cur_metric:.5f}')
            if cur_metric >= pre_metric:
                pre_metric = cur_metric
            else:
                stop_increase_cnt += 1
            cur_metric = 0.

        if stop_increase_cnt >= stop_increase:
            break

    final_metric = model.perplexity if metric == 'pp' else model.ll_per_word

    print(f'Trial iterations: {i + min_iter}.')
    return model, final_metric
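A minimal usage sketch for the function above, assuming `dataset` is a sequence of `(tokens, label)` pairs as the loop in the function expects; the toy documents, labels, and parameter values here are invented for illustration:

# Hypothetical toy dataset: each entry is (list of tokens, numeric label)
toy_dataset = [
    (['cat', 'sits', 'on', 'mat'], 0),
    (['dog', 'chases', 'ball'], 1),
    (['stock', 'market', 'rises'], 1),
    (['cat', 'and', 'dog', 'play'], 0),
]

# min_cf=0 and rm_top=0 so the tiny vocabulary is not filtered away
model, score = tp_one_trial(toy_dataset, 'lda', topic_size=2, sample_size=4,
                            min_cf=0, rm_top=0, max_iter=100)
print(f'Best log-likelihood per word: {score:.4f}')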
Example #2
import re

import nltk
import tomotopy as tp
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups

try:
    # reuse the preprocessed corpus if it was saved on a previous run
    corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
except IOError:
    porter_stemmer = nltk.PorterStemmer().stem
    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
    pat = re.compile('^[a-z]{2,}$')
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
        stopwords=lambda x: x in english_stops or not pat.match(x))
    newsgroups_train = fetch_20newsgroups()
    corpus.process(d.lower() for d in newsgroups_train.data)
    # save preprocessed corpus for reuse
    corpus.save('preprocessed_20news.cps')

mdl = tp.CTModel(tw=tp.TermWeight.IDF,
                 min_df=5,
                 rm_top=40,
                 k=30,
                 corpus=corpus)
mdl.train(0)

# Since we have more than ten thousand documents,
# setting `num_beta_sample` to a smaller value will not harm accuracy.
mdl.num_beta_sample = 5
print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words))
print('Removed Top words: ', *mdl.removed_top_words)

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
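After training, the topic correlations that distinguish CTM from plain LDA can be inspected with `get_topic_words` and `get_correlations`. A brief sketch; the `top_n` value and the 0.3 threshold are arbitrary choices for illustration:

for k in range(mdl.k):
    label = ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=3))
    print(f'Topic #{k}: {label}')
    # correlations of topic k with every other topic
    for other, corr in enumerate(mdl.get_correlations(k)):
        if other != k and corr > 0.3:  # arbitrary threshold for illustration
            print(f'  correlated with topic #{other} (corr={corr:.3f})')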
Example #3
import tomotopy as tp

model = tp.CTModel(k=2)
# print(model.get_correlations())
# print(model.get_correlations(0))
print(model.num_beta_sample)
print(model.num_tmn_sample)
# print(model.prior_cov)
# print(model.prior_mean)
# array = np.array(t5)
# print(array)
# `t5` is assumed to be a list of document strings defined elsewhere
import gensim
from gensim import corpora

dataset = [d.split() for d in t5]
dictionary = corpora.Dictionary(dataset)
print(dictionary)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in dataset]
ldamodel = gensim.models.LdaModel(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)
print(ldamodel.print_topics(num_topics=3, num_words=3))

# ********************************************************************************************
#
import tomotopy as tp

mdl = tp.CTModel(k=20)
for line in open('C://Users//bvjan//Documents//data2.txt'):
    mdl.add_doc(line.strip().split())

for i in range(0, 100, 10):
    mdl.train(10)
    # report the iteration count reached after this training step
    print('Iteration: {}\tLog-likelihood: {}'.format(i + 10, mdl.ll_per_word))

for k in range(mdl.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=10))
# **********************************************************************************************

import matplotlib.pyplot as plt

labels = t5
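
As a sketch of how the matplotlib import above might be put to use, one could track `ll_per_word` during training and plot the convergence curve; the `ll_history` list is an invented helper, not part of the original snippet:

# collect per-checkpoint log-likelihood while training continues
ll_history = []
for i in range(0, 100, 10):
    mdl.train(10)
    ll_history.append(mdl.ll_per_word)

plt.plot(range(10, 101, 10), ll_history)
plt.xlabel('Iteration')
plt.ylabel('Log-likelihood per word')
plt.title('CTM training convergence')
plt.show()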