Example #1
def get_lda(text_dictionary):
    train = []

    for key, line in text_dictionary.items():
        line = line.strip().split(' ')
        train.append(line)

    print(len(train))
    print(' '.join(train[2]))

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

    topic_list = lda.print_topics(20)
    print(type(lda.print_topics(20)))
    print(len(lda.print_topics(20)))

    for topic in topic_list:
        print(topic)
    print("第一主题")
    print(lda.print_topic(1))

    print('给定一个新文档,输出其主题分布')

    # test_doc = list(new_doc) #新文档进行分词
    test_doc = train[2]  # 查看训练集中第三个样本的主题分布
    doc_bow = dictionary.doc2bow(test_doc)  # 文档转换成bow
    doc_lda = lda[doc_bow]  # 得到新文档的主题分布
    # 输出新文档的主题分布
    print(doc_lda)
    for topic in doc_lda:
        print("%s\t%f\n" % (lda.print_topic(topic[0]), topic[1]))
Example #2
def save_model():
    """
    保存LDA模型
    :param model_path:
    :return:
    -----------------
    corpus:[
            [('词ID', 词频),('词ID', 词频)...],
            [('词ID', 词频),('词ID', 词频)...],
            .......
            ] 稀疏向量集
    id2word: {'词1':0, '词2':1. ..}

    """
    train_set = get_train_set()
    word_dict = Dictionary(train_set)  # build the dictionary: each word maps to an integer id
    corpus_list = [word_dict.doc2bow(text)
                   for text in train_set]  # count word frequencies and convert to the sparse vector format
    lda = LdaModel(
        corpus=corpus_list,
        id2word=word_dict,
        num_topics=100,
        # passes=5, # epoch
        alpha='auto')
    lda.print_topic(99)
    # save the LDA model
    lda.save(lda_model_path)
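
Since the function only saves the model to disk, here is a minimal, hedged sketch of loading it back for inspection; it assumes `lda_model_path` is the same path used above and is available in the surrounding module.

# Hedged sketch: reload the saved model and inspect one topic.
from gensim.models import LdaModel

lda = LdaModel.load(lda_model_path)
print(lda.print_topic(0, topn=10))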
Example #3
def testLDA(file):
    file = open(file, encoding='UTF-8')
    train = []
    try:
        lines = file.readlines()
        stopwords = open("file\\stopwords_cn.txt",
                         encoding='UTF-8').readlines()
        stopwords = [w.strip() for w in stopwords]
        for line in lines:
            line = line.split()
            train.append([w for w in line if w not in stopwords])
    finally:
        file.close()
        dictionary = corpora.Dictionary(train)  # build the dictionary
        # use the dictionary to convert documents from word strings to id-based vectors
        corpus = [dictionary.doc2bow(text) for text in train]
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
        lda.print_topic(2)
        lda.save('file\\lda.model')
Example #4
def lda():
    # remove stop words
    stopwords = codecs.open('../conf/stop_words_ch.txt',
                            mode='r',
                            encoding='utf8').readlines()
    stopwords = [w.strip() for w in stopwords]

    fp = codecs.open(r'D:\nlp\corpora\segs.txt', mode='r', encoding='utf8')
    train = []
    for line in fp:
        line = line.split()
        train.append([w for w in line if w not in stopwords])

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

    lda.print_topics(30)
    # print topic id=20
    lda.print_topic(20)

    # save/load model (raw string avoids '\n' being read as a newline escape)
    lda.save(r'D:\nlp\corpora\news.model')
Example #6
def create_gensim_lda_model(doc_clean,number_of_topics):
    """
    Input  : clean documents and the number of topics
    Purpose: create an LDA model using gensim
    Output : the trained LDA model, the dictionary and the document-term matrix
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LDA model
    ldamodel = LdaModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary, alpha="auto", eval_every=5)  # train model
    #print(ldamodel.print_topics(num_topics=number_of_topics, num_words=words))
    #ldamodel.print_topics(-1)
    for i in range(ldamodel.num_topics):
        print(ldamodel.print_topic(i))
    return ldamodel, dictionary, doc_term_matrix
Example #7
def extract_topics(words):
    word_id_map=Dictionary([words])
    word_id_map.filter_tokens([id for id, occurrence in word_id_map.dfs.items() if occurrence == 2])
    word_id_map.compactify()
    deals_corpus=[word_id_map.doc2bow(words)]
    lda=LdaModel(corpus=deals_corpus, id2word=word_id_map, num_topics=15, update_every=1, chunksize=1000,passes=1)
    topics=[]
    for i in range(15):
        tokens=lda.print_topic(i).split('+')
        topic_scores=[]
        for token in tokens:
            score,token_val=token.split('*')
            topic_scores.append((token_val,score))
        topics.append(topic_scores)
    return topics
Example #8
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA.
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only, so for a pure topic modeling task it is a
    lighter option to install and run. It can also be run distributed and updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print "Dictionary file does not exist. Creating one"
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.items() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print dictionary

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print "Corpus file does not exist. Creating one"
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print mm
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000,
                   passes=1)
    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
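
The docstring notes that a gensim model can be updated over an existing model; here is a minimal, hedged sketch of that online update. It reuses the `dictionary` and `lda` objects built inside `get_topics_lda`, and `new_token_lists` is a hypothetical batch of pre-tokenized documents.

# Hedged sketch: online update of an already trained LdaModel with additional documents.
new_token_lists = [['deal', 'discount', 'shoes'], ['coupon', 'sale']]  # hypothetical new documents
new_corpus = [dictionary.doc2bow(tokens) for tokens in new_token_lists]
lda.update(new_corpus)  # refines the existing topics instead of retraining from scratch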
Example #9
class LDA_result(object):
    
    def __init__(self, abtract_complete_true,num_topics = 4, chunksize = 1000, passes = 60, iterations = 600, eval_every = None):
        self.num_journal = len(abtract_complete_true)
        self.abtract_complete_true = abtract_complete_true
        self.abtract_complete = self.abtract_complete_combination()
        self.dictionary = corpora.Dictionary(self.abtract_complete)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.abtract_complete]    
        self.temp = self.dictionary[0]
        self.id2word = self.dictionary.id2token
        self.num_topics = num_topics
        self.chunksize = chunksize
        self.passes = passes
        self.iterations = iterations
        self.eval_every = eval_every
        self.model = LdaModel(corpus = self.corpus, id2word = self.id2word, chunksize = self.chunksize, \
                       alpha='auto', eta='auto', \
                       iterations = self.iterations, num_topics = self.num_topics, \
                       passes = self.passes, eval_every = self.eval_every)           # build the model
    
    # combine the abstract word lists
    def abtract_complete_combination(self):
        abtract_complete = []
        for journal_word_list in self.abtract_complete_true.values():
            abtract_complete.append(journal_word_list)
        return abtract_complete
                
    ## describe the corpus
    def description(self):
        print('Number of unique tokens: %d' % len(self.dictionary))
        print('Number of documents: %d' % len(self.corpus))
    
    # convert to tf-idf vectors
    def word2tfidf(self):
        tfidf = models.TfidfModel(self.corpus)
        corpusTfidf = tfidf[self.corpus]
        return corpusTfidf
    
    # print the keywords of each topic
    def key_words(self):
        top_topics = self.model.top_topics(self.corpus)
        pprint(top_topics) 
        
    # each line contains the topic words and their weights
    def key_weight(self):
        print(self.model.print_topic(0,10))
        print(self.model.print_topic(1,10))  
    
    # determine which topic the first training document belongs to (mostly illustrative)
    def topic_belong(self):
        for index, score in sorted(self.model[self.corpus[0]], key=lambda tup: -1*tup[1]):
            print("Score: {}\n Topic: {}".format(score, self.model.print_topic(index, 10)))
    
    # visualize the LDA model
    def visible(self):
        vis_wrapper = pyLDAvis.gensim.prepare(self.model,self.corpus,self.dictionary)
        pyLDAvis.display(vis_wrapper)
        pyLDAvis.save_html(vis_wrapper,"lda%dtopics.html"%self.num_topics)
        pyLDAvis.show(vis_wrapper)
    
    # output, for each training document, the probability of belonging to the different topics
    def community_belong(self):
        journal_community = {}
        for i, element in enumerate(self.abtract_complete_true):
            journal_community[element] = []
            for index, score in sorted(self.model[self.corpus[i]], key=lambda tup: -1*tup[1]):
                if score > 0.2:
                    journal_community[element].append(str(index))
                print(index, score)
        return journal_community

    # given a new corpus
#    @staticmethod
#    def word_corpus(abtract_complete):
#        dictionary = corpora.Dictionary(abtract_complete)
#        corpus = [dictionary.doc2bow(text) for text in abtract_complete]  
#        return corpus
    
    # determine the topic membership of a new document
    def identify_community(self, abtract_complete):
        corpus = self.dictionary.doc2bow(abtract_complete)
        community = []
        for index, score in sorted(self.model[corpus], key=lambda tup: -1*tup[1]):
            if score > 0.2:
                community.append(str(index)) 
        return community
model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

top_topics = model.top_topics(corpus, 5)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

model.print_topic(1, 30)
model.print_topic(3, 30)

# determine which topic a training-set document belongs to
for index, score in sorted(model[corpus[0]], key=lambda tup: -1 * tup[1]):
    print("Score: {}\t Topic: {}".format(score, model.print_topic(index, 10)))

# output the probability of the training document belonging to the different topics
for index, score in sorted(model[corpus[0]], key=lambda tup: -1 * tup[1]):
    print(index, score)

# determine which topic a test-set document belongs to
#unseen_document = [" ".join(text_i) for text_i in clean_text4[130]]
#unseen_document = " ".join(unseen_document)

unseen_document = text[130]
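
The snippet breaks off after selecting unseen_document; a minimal, hedged sketch of the inference step that usually follows, assuming unseen_document is a pre-tokenized list and `dictionary` is the gensim Dictionary used to build `corpus` (not shown in this excerpt):

# Hedged sketch: score the unseen document against the trained model.
unseen_bow = dictionary.doc2bow(unseen_document)
for index, score in sorted(model[unseen_bow], key=lambda tup: -1 * tup[1]):
    print("Score: {}\t Topic: {}".format(score, model.print_topic(index, 10)))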
Example #11
titles = train_data['title'].tolist()

# 1. load the corpus
with open('../data/train_docs.pkl', 'rb') as in_data:
    train_docs = pickle.load(in_data)

train_docs = [[word for word in doc.split(' ')] for doc in train_docs]

dictionary = corpora.Dictionary(train_docs)
corpus = [dictionary.doc2bow(text) for text in train_docs]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=6)

topic_list = lda.print_topics(20)
# print(type(lda.print_topics(20)))

for topic in topic_list:
    print(topic)
print("第一主题", lda.print_topic(1))
print('给定一个新文档,输出其主题分布')

# test_doc = list(new_doc) #新文档进行分词
test_doc = train_docs[2]  # 查看训练集中第三个样本的主题分布
print(test_doc)
doc_bow = dictionary.doc2bow(test_doc)  # 文档转换成bow
doc_lda = lda[doc_bow]  # 得到新文档的主题分布
# 输出新文档的主题分布
print(doc_lda)

for topic in doc_lda:
    print("%s\t%f\n" % (lda.print_topic(topic[0]), topic[1]))
Example #12

# Code starts here

# Calling the function
topic_list, coherence_value_list = compute_coherence_values(
    dictionary=dictionary,
    corpus=doc_term_matrix,
    texts=doc_clean,
    start=1,
    limit=41,
    step=5)
print(coherence_value_list)
# Finding the index associated with maximum coherence value
max_index = coherence_value_list.index(max(coherence_value_list))

# Finding the optimum no. of topics associated with the maximum coherence value
opt_topic = topic_list[max_index]
print("Optimum no. of topics:", opt_topic)

# Implementing LDA with the optimum no. of topic
lda_model = LdaModel(corpus=doc_term_matrix,
                     num_topics=opt_topic,
                     id2word=dictionary,
                     iterations=10,
                     passes=30,
                     random_state=0)

# pprint(lda_model.print_topics(5))
lda_model.print_topic(1)
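
compute_coherence_values is defined elsewhere in this project; the following is a minimal, hedged sketch of what such a helper typically looks like with gensim's CoherenceModel. The body below is an assumption consistent with the call above, not the original implementation.

# Hedged sketch of a compute_coherence_values helper (assumed, not the original).
from gensim.models import CoherenceModel, LdaModel

def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    topic_list, coherence_values = [], []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, random_state=0)
        cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        topic_list.append(num_topics)
        coherence_values.append(cm.get_coherence())
    return topic_list, coherence_values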
Example #13
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel

pos_com = pd.read_csv('data/pos_com.csv', header=None, index_col=0)
neg_com = pd.read_csv('data/neg_com.csv', header=None, index_col=0)

# positive comments
pos_com.columns = ['comment']
mid = list(pos_com['comment'].str.split(' '))
dictionary = Dictionary(mid)
bow = [dictionary.doc2bow(com) for com in mid]
# build the model
pos_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=3)
pos_model.print_topic(0)
pos_model.print_topic(1)
pos_model.print_topic(2)

# negative comments
neg_com.columns = ['comment']
mid = list(neg_com['comment'].str.split(' '))
dictionary = Dictionary(mid)
bow = [dictionary.doc2bow(com) for com in mid]
# build the model
neg_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=3)
neg_model.print_topic(0)
neg_model.print_topic(1)
neg_model.print_topic(2)
Example #14
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

#Train the model on the corpus
lda = LdaModel(common_corpus, num_topics=10)
'''Taking it apart step by step: common_texts is a list, and each of its elements can be regarded as one document, itself also a list:'''
print(type(common_texts))
print(common_texts[0])
'''Step two: the doc2bow method converts text into bag-of-words form; one official example should make this clear.'''
from gensim.corpora import Dictionary
dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
print(dct.doc2bow(["this", "is", "máma"]))
print(dct.doc2bow(["this", "is", "máma"], return_missing=True))
'''At initialization every word is assigned an id; when a new text comes in, the id and count of each of its words are returned, and for words not in the original dictionary you can control whether they are returned.
The corpus generated at this point is exactly the input for training the LDA model; let us check it:'''
print(common_corpus[0])
# the word "human" has id 0 and appears only once in the first document
'''Final step: we just call the LDA model, here specifying 10 topics.'''
from gensim.models import LdaModel
lda = LdaModel(common_corpus, num_topics=10)
'''Let us inspect the result (there are many other methods, see the documentation), for example which words make up topic 1:'''
print(lda.print_topic(1, topn=2))
'''This shows the word distribution of the topic, with words 9 and 10 weighted most heavily (topn controls how many words are printed; the corresponding words can be looked up in the dictionary built earlier).
We can also update the trained lda model with a new corpus:'''
'''
# updates all parameters
lda.update(other_corpus)
# the topic distribution can also be updated on its own; the inputs are the previous parameters, where rho is the learning rate
lda.update_alpha(gammat, rho)
# and likewise the word distribution
lda.update_eta(lambdat, rho)
'''
Example #15
    mydict, corpus = tf_idf(documents)

    # Save the Dict and Corpus
    #mydict.save('mydict.dict')  # save dict to disk
    #corpora.MmCorpus.serialize('bow_corpus.mm', corpus)  # save corpus to disk

    # Load them back
    '''
    mydict = corpora.Dictionary.load('mydict.dict')
    corpus = corpora.MmCorpus('bow_corpus.mm')
    '''
    nb_topic = 3
    lda = LdaModel(corpus,
                   id2word=mydict,
                   num_topics=nb_topic,
                   passes=2,
                   per_word_topics=False,
                   iterations=200)
    #lda.save('lda_model.model')
    ldaa = []
    for i in range(0, nb_topic):
        d = re.findall('"([^"]*)"', lda.print_topic(i, 300))
        for word in d:
            if len(word) > 3 and len(ldaa) % 3 != 4 and word not in ldaa:
                ldaa.append(word)
    result = {}
    result['lda'] = ldaa[0:20]
    print(result)

    #print(lda.show_topics(num_topics=2, num_words=500, log=True))
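
    # Added note: instead of parsing the formatted string from print_topic() with a regular
    # expression, show_topic() returns the same top words directly as (word, probability)
    # pairs. A minimal, hedged sketch of an equivalent collection step:
    ldaa_alt = []
    for i in range(0, nb_topic):
        for word, prob in lda.show_topic(i, topn=300):
            if len(word) > 3 and word not in ldaa_alt:
                ldaa_alt.append(word)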
Example #16
ax1.set_xticks(x)
fig.tight_layout()
plt.savefig('work/metrics.png')  # save before show(), otherwise the figure may be empty
plt.show()

# %%
NUM_TOPICS = 4
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=NUM_TOPICS,
                     random_state=0)
lda_model.save('work/lda.model')

# %%
for i in range(lda_model.num_topics):
    print('TOPIC:', i, '__', lda_model.print_topic(i))

# %%
# WordCloud
# Download a Japanese font and place it under work/
fig, axs = plt.subplots(ncols=2,
                        nrows=math.ceil(lda_model.num_topics / 2),
                        figsize=(16, 20))
axs = axs.flatten()


def color_func(word, font_size, position, orientation, random_state,
               font_path):
    return 'darkturquoise'

texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

from collections import defaultdict

frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] >= 1]
         for text in texts]

from pprint import pprint  # pretty-printer

dictionary = corpora.Dictionary(texts)
# dictionary.save('/tmp/deerwester.dict') # store the dictionary, for future reference
# print(dictionary)
corpus = [dictionary.doc2bow(text) for text in texts]
# corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

lda = LdaModel(corpus, num_topics=2)

# on a new document:
new_doc = "pretty obvious that when i write my tellall memoir someday there will be four to six"
new_vec = dictionary.doc2bow(new_doc.lower().split())

print(lda.print_topic(0))
print(lda.show_topic(1))
print(lda.get_document_topics(new_vec))
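
Note that this LdaModel was trained without id2word, so print_topic and show_topic report integer word ids rather than the original words. A minimal, hedged variant that keeps the word mapping:

# Hedged sketch: pass the dictionary so topics are reported with words instead of ids.
lda_words = LdaModel(corpus, id2word=dictionary, num_topics=2)
print(lda_words.print_topic(0))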
Example #18
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every)

print(model.print_topics())

for i in range(len(articleurls)):
    print("----")
    print(articleurls[i])
    topics = model.get_document_topics(corpus[i])

    for topic in topics:
        print("Topic:" + str(topic[0]))
        print(model.print_topic(topic[0]))
        print(topic[1])

    print("----")

#wordcloud = WordCloud(
#    background_color="white",
#    max_words=5000,
#    contour_width=3,
#    contour_color='steelblue',
#    width=1600,
#    height=800
#)
#
#wordcloud.generate_from_text(complete)
#wordcloud.to_file('wordcloud.png')
Example #19
print(lda.get_document_topics(test))
print(lda[test])

# Arguments: (word_id, minimum_probability=None)
# Most relevant topics for the given word.
# Each topic is represented as a tuple of (topic_id, term_probability).
print(lda.get_term_topics(0))

# ----- print the composition of a given topic -----
# Arguments: (topicid, topn=10)
# Returns a list, format: [(word_id, probability), ...].
print(lda.get_topic_terms(0))
# Arguments: (topicid, topn=10); returns a list of (word, probability) pairs
print(lda.show_topic(0))
# Returns a string, format: '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'
# Arguments: (topicno, topn=10)
print(lda.print_topic(0))

# ----- print the composition of all topics -----
# Default arguments: (num_topics=10, num_words=10, log=False, formatted=True)
# Returns a list of (topic_id, string) pairs, e.g. [(0, '-0.340 * "category" + 0.298 * "$M$" + ...'), ...]
print(lda.show_topics())
# [num_topics, vocabulary_size] array of floats (self.dtype)
# which represents the term-topic matrix learned during inference.
print(lda.get_topics())

# ----- save and load model -----
lda.save(fname="lda_model")
lda = lda.load(fname="lda_model")  # load() is a classmethod; assign its return value to get the loaded model
print(lda[test])
Example #20
class LdaModelHelper:

    status_scheduled = 'scheduled'
    status_computing = 'computing'
    status_completed = 'completed'
    status_error = 'killed'

    default_use_lemmer = True
    default_min_df = 2
    default_max_df = 0.8

    def __init__(self,
                 training_number_of_topics_to_extract,
                 language,
                 training_use_lemmer=True,
                 training_min_df=2,
                 training_max_df=0.8,
                 chunksize=2000,
                 passes=2):
        """

        :rtype: LdaModelHelper
        :param training_use_lemmer:
        :param training_min_df: int or float, min document frequency / document proportion (if float < 1)
        to consider a term in the model
        :param training_max_df: int or float, max document frequency / document proportion (if float < 1)
        to consider a term in the model
        """

        self.language = language

        self.analysis_use_lemmer = LdaModelHelper.default_use_lemmer
        self.analysis_min_df = LdaModelHelper.default_min_df
        self.analysis_max_df = LdaModelHelper.default_max_df

        self.analysis_corpus = None
        self.analysis_features_names = None
        self.analysis_documents = None

        self.training_number_of_topics_to_extract = training_number_of_topics_to_extract
        self.training_use_lemmer = training_use_lemmer
        self.training_min_df = training_min_df
        self.training_max_df = training_max_df
        self.chunksize = chunksize
        self.passes = passes

        self.training_corpus = None
        self.training_features_names = None
        self.analysis_documents = None
        self.training_documents = None

        self.lda_model = None
        self.model_computation_time = None

        self.topic_labels = None
        self.topic_assignment = None

    def set_analysis_parameters(self,
                                analysis_use_lemmer=True,
                                analysis_min_df=2,
                                analysis_max_df=0.8):

        self.analysis_use_lemmer = analysis_use_lemmer
        self.analysis_min_df = analysis_min_df
        self.analysis_max_df = analysis_max_df

        # reset related fields
        self.topic_assignment = None
        self.topic_labels = None
        self.analysis_corpus = None
        self.analysis_features_names = None
        self.analysis_documents = None

    def generate_model_filename(self):
        return "_".join([
            str(time.time()),
            str(self.training_number_of_topics_to_extract),
            str(self.training_min_df),
            str(self.training_max_df),
            str(self.training_use_lemmer)
        ]).replace('.', '')

    def set_lda_model(self, lda_model):
        self.lda_model = lda_model

    #####################
    # Model computation
    #####################

    def compute_lda_model(self, texts):
        """
        Compute the lda model
        :return:
        """
        if self.training_corpus is None:
            self.compute_corpus(texts, parameters='training')

        if self.training_corpus is None or len(self.training_corpus) == 0:
            raise Exception(
                'The training corpus is empty. Tune model computation parameters.'
            )

        start = time.time()

        if self.passes == 2:
            passes = 10 if (len(self.training_corpus) /
                            self.chunksize) < 10 else 2
        else:
            passes = self.passes

        id2word = {k: v for k, v in enumerate(self.training_features_names)}

        self.lda_model = LdaModel(
            self.training_corpus,
            id2word=id2word,
            num_topics=self.training_number_of_topics_to_extract,
            eval_every=1,
            passes=passes,
            chunksize=self.chunksize)
        end = time.time()

        self.model_computation_time = end - start

    def save_model_to_file(self, file_path):
        """

        :type file_path: str
        :param file_path: the path of the models file
        :return:
        """
        if self.lda_model is None:
            logging.error('The model has not been computed yet.')
            return False
        else:
            self.lda_model.save(file_path)

    def load_model_from_file(self, input_filepath):
        """

        :param input_filepath:
        :return:
        """
        self.lda_model = LdaModel.load(input_filepath)

    def compute_corpus(self, texts, parameters='training'):
        """
        Compute the corpus in gensim format considering the specified set of parameters 'training' or 'analysis'.
        :param parameters:
        :param texts:
        :return:
        """
        if parameters == 'training':
            tf_matrix, tf_matrix_features_names, tf_matrix_docs_ids = self.compute_tf_matrix(
                texts, parameters)

            if tf_matrix_features_names is None or len(
                    tf_matrix_features_names) == 0:
                return []

            self.training_corpus = matutils.Sparse2Corpus(
                tf_matrix, documents_columns=False)
            self.training_features_names = tf_matrix_features_names
            self.training_documents = tf_matrix_docs_ids
            return self.training_corpus
        elif parameters == 'analysis':
            if self.lda_model is None:
                logging.error('The model has not been computed yet.')
                return None
            else:
                # Note: words not included in the model are ignored
                tf_matrix, tf_matrix_features_names, tf_matrix_docs_ids = self.compute_tf_matrix(
                    texts, parameters)

                if len(tf_matrix_features_names) == 0:
                    return []

                corpus = [None] * tf_matrix.shape[0]

                if len(tf_matrix_features_names) != 0:
                    word2id = {
                        self.lda_model.id2word[id]: id
                        for id in self.lda_model.id2word.keys()
                    }

                    for i in range(tf_matrix.shape[0]):
                        doc = tf_matrix.getrow(i)
                        _, cols = doc.nonzero()

                        corpus[i] = [None] * len(cols)
                        count = 0
                        for col in cols:
                            if tf_matrix_features_names[col] in word2id.keys():
                                corpus[i][count] = (int(
                                    word2id[tf_matrix_features_names[col]]),
                                                    int(tf_matrix[i, col]))
                                count += 1

                        corpus[i] = corpus[i][:count]

                self.analysis_corpus = corpus
                self.analysis_features_names = tf_matrix_features_names
                self.analysis_documents = tf_matrix_docs_ids

                return self.analysis_corpus
        else:
            logging.error(
                "Value not allowed for argument parameters. Allowed values are 'training' or 'analysis'."
            )
            return None

    def compute_corpus_single_query(self, text):
        """
        Compute the corpus in gensim format for a single query (this implies using special parameters for preprocessing)
        :param text:
        :return:
        """

        if self.lda_model is None:
            logging.error('The model has not been computed or loaded yet.')
            return None, None
        else:
            # Note: words not included in the model are ignored
            stopwords_list = lda_utils.get_stopwords(self.language)
            tf_matrix, tf_matrix_features_names = lda_utils.compute_tf(
                [text], stopwords_list, self.language, True, 1, 1.0)

            if len(tf_matrix_features_names) == 0:
                return [], tf_matrix_features_names

            corpus = [None] * tf_matrix.shape[0]

            if len(tf_matrix_features_names) != 0:
                word2id = {
                    self.lda_model.id2word[id]: id
                    for id in self.lda_model.id2word.keys()
                }

                for i in range(tf_matrix.shape[0]):
                    doc = tf_matrix.getrow(i)
                    _, cols = doc.nonzero()

                    corpus[i] = [None] * len(cols)
                    count = 0
                    for col in cols:
                        if tf_matrix_features_names[col] in word2id.keys():
                            corpus[i][count] = (int(
                                word2id[tf_matrix_features_names[col]]),
                                                int(tf_matrix[i, col]))
                            count += 1

                    corpus[i] = corpus[i][:count]

            return corpus, tf_matrix_features_names

    def compute_tf_matrix(self, texts, parameters='training'):
        """
        Compute the tf matrix using the specified set of parameters ('training' or 'analysis').
        If texts is not specified the system tries to retrieve data directly from the associated db.
        :param parameters: 'training' or 'analysis'
        :param texts: list of strings representing texts to transform.
        :return:
        """

        tf_matrix_docs_id = None
        if parameters == 'training' or parameters == 'analysis':

            stopwords_list = lda_utils.get_stopwords(self.language)

            if parameters == 'training':
                use_lemmer = self.training_use_lemmer
                min_df = self.training_min_df
                max_df = self.training_max_df
            else:
                use_lemmer = self.analysis_use_lemmer
                min_df = self.analysis_min_df
                max_df = self.analysis_max_df

            tf_matrix, tf_matrix_features_names = lda_utils.compute_tf(
                texts, stopwords_list, self.language, use_lemmer, min_df,
                max_df)
        else:
            logging.error(
                "Value not allowed for argument parameters. Allowed values are 'training' or 'analysis'."
            )
            return None

        return tf_matrix, tf_matrix_features_names, tf_matrix_docs_id

    def compute_topic_assignment(self, texts):
        """
        Computes the topics assignment for each document w.r.t the specified topic_model

        Example of output = [[(25, 0.1174058544855012), (49, 0.82926081218116554)],
                            [(6, 0.29928250617927882), (49, 0.59405082715405444)]]

        :param texts:
        :return:
        """
        corpus = self.compute_corpus(texts, parameters='analysis')

        if len(corpus) == 0:
            raise Exception(
                'The corpus is empty. Tune analysis parameters and check stopwords.'
            )

        computed_assignment = self.lda_model[corpus]
        if texts is not None:
            # is the corpus related to analysis parameters
            self.topic_assignment = computed_assignment

        return computed_assignment

    def compute_topic_assignment_for_query(self, text):
        corpus, _ = self.compute_corpus_single_query(text)

        if corpus is None or len(corpus) == 0:
            raise Exception(
                'The corpus is empty. Tune analysis parameters and check stopwords.'
            )

        computed_assignment = self.lda_model[corpus]

        return computed_assignment

    #######################
    # Print functions
    #######################

    def print_topic_assignment(self, topic_assignment):
        """
        Print a topic assignment in a human readable format
        :param topic_assignment:
        :return:
        """
        print('\tTopic importance\tTopic description')
        for i, doc in enumerate(topic_assignment):
            print('Document {0}'.format(i))
            for a in doc:
                print()
                string_topic = a[
                    0] if self.lda_model is None else self.lda_model.print_topic(
                        a[0])
                print('\t{1:2f}\t\t{0}'.format(string_topic, a[1]))

    def print_all_topics(self,
                         num_topics=10,
                         num_words=20,
                         try_to_disambiguate=False,
                         min_word_probabity_for_disambiguation=0.010):
        """
        Print topics from a given LdaModel
        """
        print('Print {0} topics'.format(num_topics))
        print('------------')
        for t in self.lda_model.show_topics(num_topics=num_topics,
                                            num_words=num_words,
                                            formatted=False):
            if try_to_disambiguate:
                possible_labels = self.__class__.label_topic_by_probability(
                    self.lda_model.show_topic(t[0]),
                    min_word_probability=min_word_probabity_for_disambiguation
                )[:2]
                print('{0}:\t{1}\n'.format(t[0], possible_labels))
                print('{0}\n'.format(t[1]))
            else:
                print('{0}:\t{1}\n'.format(t[0], t[1]))

    def get_topic_description(self, topic_id, num_words=20):
        """
        Return the top `num_words` words of the given topic from the trained LdaModel
        """
        if self.lda_model is None:
            logging.error('The model has not been computed yet.')
        else:
            return self.lda_model.show_topic(topic_id, num_words)

    #######################
    # Labeling functions
    #######################

    def compute_topic_labels(self,
                             labeling_mode='mixed',
                             min_word_probability=0.01,
                             max_number_of_words_per_query=6,
                             n_words_to_label=3):
        """
        The labeling is performed querying wikipedia with a set of representative words for the topic.
        The words are chosen with the parameter
        labeling_mode:
        - 'based_on_probability': considers all words with a weight (probability) greater than 0.010
        - 'based_on_top_words': considers the 3 most probable words for the topic
        - 'mixed': try with 'based_on_probability', if there are no results try with 'based_on_top_words'
        """

        if self.lda_model is None:
            logging.error('No LDA model loaded.')
            return

        n_labels_to_save = 3
        self.topic_labels = {}

        # label topics
        for t in self.lda_model.show_topics(
                num_topics=self.training_number_of_topics_to_extract,
                num_words=40,
                formatted=False):
            topic_id = t[0]

            possible_labels = []
            if labeling_mode == 'mixed' or labeling_mode == 'based_on_probability':
                possible_labels = self.__class__.label_topic_by_probability(
                    self.lda_model.show_topic(topic_id),
                    min_word_probability=min_word_probability,
                    max_words=max_number_of_words_per_query)[:n_labels_to_save]

            if len(possible_labels) == 0:
                # try to disambiguate by n_words
                possible_labels = self.__class__.label_topic_by_number_of_words(
                    self.lda_model.show_topic(topic_id),
                    n_words=n_words_to_label)[:n_labels_to_save]

            for i in range(len(possible_labels), n_labels_to_save):
                # fill empty labels
                possible_labels.append('')

            self.topic_labels[topic_id] = possible_labels
            time.sleep(0.5)

    def get_topic_labels(self):
        if self.topic_labels is None:
            self.compute_topic_labels()

        return self.topic_labels

    def get_all_topics(self):
        """
        Return a dictionary where keys are topic ids (integers) and values are word distributions.
        A word distribution is a dictionary where keys are words and values are word weights within the topic
        :rtype: dict
        :return:
        """

        topics = {}

        for t in self.lda_model.show_topics(
                num_topics=self.training_number_of_topics_to_extract,
                num_words=config.max_number_of_words_per_topic,
                formatted=False):
            topic_id = t[0]
            topic_distr = self.get_word_frequencies(
                self.lda_model.show_topic(
                    topic_id, config.max_number_of_words_per_topic))

            topics[topic_id] = topic_distr

        return topics

    def _get_words_distribution(self, topic_id):
        """
        Return a dictionary where keys are words and values are word weights within the topic

        :param topic_id: the topic index
        :rtype: dict
        :return:
        """
        topic_description = self.lda_model.show_topic(
            topic_id, config.max_number_of_words_per_topic)
        return self.__class__.get_word_frequencies(topic_description)

    @classmethod
    def delete_model_files(cls, folder_path, files_prefix):
        """
        Delete all files related to a model that have the specified file prefix
        :param folder_path:
        :param files_prefix:
        :rtype:
        :return: 200 if all files have been removed, 404 if the files do not exist
        """
        if os.path.exists(os.path.join(folder_path, files_prefix)):
            files_to_remove = [
                files_prefix,
                files_prefix + ".state",
                files_prefix + ".expElogbeta.npy",
                files_prefix + ".id2word",
            ]

            for f in files_to_remove:
                os.remove(os.path.join(folder_path, f))

            return 200
        else:
            logging.error('[ERROR] Model files do not exist.')
            return 404

    #######################
    # Topic labeling
    #######################

    @classmethod
    def label_topic_by_probability(cls,
                                   topic_description,
                                   min_word_probability=0.010,
                                   max_words=6):
        """
        Try to disambiguate a topic considering all words with a weight greater than min_word_probability
        :param max_words:
        :param topic_description: is a list of pairs  (word, word_probability)
        :param min_word_probability: is the minimum probability for words
        :return: list of strings, possible wikipedia pages
        """
        words = [w for w, p in topic_description if p >= min_word_probability]
        words = words[:max_words]

        if len(words) == 0:
            # if no words are over the threshold return empty
            res = []
        else:
            res = wikipedia.search(' '.join(words))

        return res

    @classmethod
    def label_topic_by_number_of_words(cls, topic_description, n_words=5):
        """
        Try to disambiguate a topic considering top k words in its description
        :param n_words:
        :param topic_description: is a list of pairs  (word, word_probability)
        :return: list of strings, possible wikipedia pages
        """
        words = [t[0] for i, t in enumerate(topic_description) if i < n_words]

        if len(words) == 0:
            # if no words are over the threshold, take the first
            words = [topic_description[0][0]]

        res = wikipedia.search(' '.join(words))
        return res

    @classmethod
    def get_word_frequencies(cls, topic_description):
        """
        Given a topic description, returns the corresponding dictionary with words as keys
        and word weights as values.
        :param topic_description: list of pairs (word, word_weight)
        :return:
        """
        frequencies = {w: f for w, f in topic_description}
        return frequencies
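
A minimal, hedged usage sketch for this helper class; it assumes the lda_utils and config modules the class depends on are available and that `texts` is a list of raw document strings.

# Hedged usage sketch for LdaModelHelper (argument values are illustrative).
helper = LdaModelHelper(training_number_of_topics_to_extract=10,
                        language='en',
                        training_use_lemmer=True,
                        training_min_df=2,
                        training_max_df=0.8)
helper.compute_lda_model(texts)                      # builds the tf matrix, corpus and LDA model
helper.print_all_topics(num_topics=10, num_words=10)
assignment = helper.compute_topic_assignment(texts)  # per-document [(topic_id, weight), ...]
helper.print_topic_assignment(assignment)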
Example #21
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return topic_list, coherence_values


# Code starts here
topic_list, coherence_value_list = compute_coherence_values(
    dictionary=dictionary,
    corpus=doc_term_matrix,
    texts=doc_clean,
    start=1,
    limit=41,
    step=5)
print(coherence_value_list)

max_index = coherence_value_list.index(max(coherence_value_list))

opt_topic = topic_list[max_index]

print("Optimum no of topics: ", opt_topic)

lda_model = LdaModel(corpus=doc_term_matrix,
                     num_topics=opt_topic,
                     id2word=dictionary,
                     iterations=10,
                     passes=30,
                     random_state=0)

lda_model.print_topic(5)
Example #22
                        'r',
                        encoding='utf8').readlines()
stopwords = [w.strip() for w in stopwords]
fp = codecs.open('../../corpus/test.lsnp', 'r', encoding='utf8')
for line in fp:
    line = line.split()
    train.append([w for w in line if w not in stopwords])
print(train)
dictionary = Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
print(corpus[0])
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
# print the word distributions of the first 20 topics
print(lda.print_topics(20))
# print the word distribution of the topic with id 20
print(lda.print_topic(20))
# save / load the model
lda.save('zhwiki_lda.model')
lda = models.ldamodel.LdaModel.load('zhwiki_lda.model')

# tt = 'loss of energy , motivation and no interest in work anymore - be it time to through it all in'
#
# test_doc = list(i for i in tt.split())
#
# doc_bow = id2word.doc2bow(test_doc)      # convert the document to bag-of-words
# doc_lda = lda[doc_bow]                   # topic distribution of the new document
# # print the topic distribution of the new document
# print doc_lda
# for topic in doc_lda:
#     print "%s\t%f\n"%(lda.print_topic(topic[0]), topic[1])
Example #23
def train_lda(period='_18_09',
              num_topics=4,
              remove_top=0,
              tfidf=False,
              passes=40,
              iterations=600,
              eval_every=None):
    set_path()
    create_dictionary(dictionary_path, content_18_09_path, content_08_98_path)
    dictionary = corpora.Dictionary.load(dictionary_path)
    create_corpus(dictionary, eval('corpus_path' + period),
                  eval('content' + period + '_path'), remove_top)
    corpus = corpora.MmCorpus(eval('corpus_path' + period))
    temp = dictionary[0]
    id2word = dictionary.id2token

    if tfidf:
        corpusTfidf = convert_tfidf(eval('corpus_path' + period))
        model = LdaModel(corpus=corpusTfidf, id2word=id2word, \
                   alpha='auto', eta='auto', \
                   iterations=iterations, num_topics=num_topics, \
                   passes=passes, eval_every=eval_every)
    else:
        # build the model
        model = LdaModel(corpus=corpus, id2word=id2word, \
                           alpha='auto', eta='auto', \
                           iterations=iterations, num_topics=num_topics, \
                           passes=passes, eval_every=eval_every)

    # serialize the model
    lda_model_file = open(eval('model_path' + period), 'wb')
    pkl.dump(model, lda_model_file)
    lda_model_file.close()

    # print the keywords of each topic
    top_topics = model.top_topics(corpus)
    print('Keywords of each topic:')
    pprint(top_topics)

    # each line contains the topic words and their weights
    print('Topic words and weights of the first two topics:')
    model.print_topic(0, 10)
    model.print_topic(1, 10)

    # output, for each training document, the probability of belonging to the different topics
    print('Topic probabilities of the first ten journals:')
    for i in list(range(10)):
        for index, score in sorted(model[corpus[i]],
                                   key=lambda tup: -1 * tup[1]):
            print(index, score)

    #calculate perplexity
    testset = []
    for i in range(corpus.num_docs):
        testset.append(corpus[i])
    perplexity(model, testset, dictionary, len(dictionary.keys()), num_topics)

    #LDA visualization---------------------------------------------------
    vis_wrapper = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    pyLDAvis.display(vis_wrapper)
    pyLDAvis.save_html(vis_wrapper, "lda%dtopics.html" % num_topics)
    pyLDAvis.show(vis_wrapper)
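
    # Added note: `perplexity` above is a project-specific helper. As a hedged alternative,
    # gensim's LdaModel exposes a per-word likelihood bound directly; perplexity = 2 ** (-bound).
    bound = model.log_perplexity(list(corpus))
    print('per-word bound: %.4f, perplexity: %.2f' % (bound, 2 ** (-bound)))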
Example #24
# 600 is an arbitrary parameter; I chose it as a starting point since roughly 700
# distinct words were found and I would like to keep as many of them as possible
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=600)

#------------------------------------------ PHASE 2 - BoW REPRESENTATION ---------------------------------------------------------------

# Convert each document into its Bag-of-Words representation using the dictionary built above
bow_corpus = [dictionary.doc2bow(doc) for doc in Corpora]

#------------------------------------------ PHASE 3 - LDA TRAINING  -------------------------------------------------------------------

# Train the LDA model
lda = LdaModel(bow_corpus, id2word=dictionary, passes=92, num_topics=28)

# Print each topic with its most significant words
for idx, topic in lda.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

#------------------------------------------ PHASE X - CLASSIFICATION EXAMPLE  ----------------------------------------------------------

# Show an example where I take a document and use LDA to see which topic it falls into,
# based on the highest score it obtains.
print("\nDocument I am trying to classify")
print(Corpora[1])
print("\n")

for index, score in sorted(lda[bow_corpus[1]], key=lambda tup: -1 * tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda.print_topic(index,
                                                                    10)))
Example #25
# 1. Read and prepare the data
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=10)

# 2. Convert text to bag-of-words
from gensim.corpora import Dictionary
dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
dct.doc2bow(["this", "is", "máma"])
[(2, 1)]
dct.doc2bow(["this", "is", "máma"], return_missing=True)
([(2, 1)], {u'this': 1, u'is': 1})

#3. 运用LDA模型
from gensim.models import LdaModel
lda = LdaModel(common_corpus, num_topics=10)
lda.print_topic(1, topn=2)
# '0.500*"9" + 0.045*"10"'
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('robCSVcorpus.mm', corpus) # store to disk, for later use
#import numpy as np
#corpusnp=np.array(corpus)
#print len(corpusnp), len(np.delete(corpusnp, 1, axis=0))
#Initialize the transformation
#term freq inverse doc freq
#trying to ind the frequency on that page versus overall frequency

lda = LdaModel(corpus, id2word=dictionary,num_topics=numTopics)

ii=0

print('These are the topics')
for i in range(0, lda.num_topics):
    print(lda.print_topic(i, topn=20))


        
#sys.exit()
doc_lda = []
for i in range(len(corpus)):
    doc_lda.append(lda[corpus[i]])
#print(doc_lda)

'''
This will simply put the tuples in a csv file, poor format
'''
with open('CorpusTopicsOld.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile)
    for i in range(len(corpus)):
Example #27
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

    # print the keywords of each topic
    top_topics = model.top_topics(corpus)
    pprint(top_topics)

    # serialize the model
    lda_model_file = open(path_model_pkl, 'wb')
    pkl.dump(model, lda_model_file)
    lda_model_file.close()
    model = read_pkl(path_ldamodel)

    # each line contains the topic words and their weights
    model.print_topic(0, 10)
    model.print_topic(1, 10)

    # determine which topic a training-set document belongs to
    for index, score in sorted(model[corpus[0]], key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score,
                                             model.print_topic(index, 10)))

    # output, for each training document, the probability of belonging to the different topics
    for i in list(range(10)):
        for index, score in sorted(model[corpus[i]],
                                   key=lambda tup: -1 * tup[1]):
            print(index, score)

    #LDA visualization---------------------------------------------------
class LDAModeling(BaseModel):

	def __init__(self, training_data, num_topics, alpha=0.01, passes=20):
		super().__init__(training_data)
		self.lda_model = None
		self.num_topics = num_topics
		self._corpus = []
		self._dictionary = []
		self._passes = passes
		self._alpha = alpha

		# self.tokenization()

	def build_lda_model(self):		
		self._dictionary = corpora.Dictionary(self._documents)
		self._dictionary.compactify()	# assign new word ids to all words. This is done to make the ids more compact
		self._corpus = [self._dictionary.doc2bow(doc) for doc in self._documents]
		self.lda_model = LdaModel(corpus=self._corpus, id2word=self._dictionary, num_topics=self.num_topics, alpha=self._alpha, passes=self._passes, minimum_probability=0)

	def set_topics(self, text_parser, emo_only_index):
		print("[*] Setting topic for each utterance...")
		emo_topics = []
		emo_list = [e[0] for e in text_parser.emotes]
		for i in range(len(text_parser.utterances)):
			if emo_only_index[i] == 1:
				text_parser.utterances[i].append(self.num_topics)
				for word in text_parser.utterances[i][0].split():
					if word.lower() in emo_list:
						emo_topics.append((word.lower(), 0))
			else:
				topic = self.query_topic(text_parser.utterances[i][0])	# topic: 0 ~ topic_num-1
				text_parser.utterances[i].append(topic)
		
		emo_topics = list(set(emo_topics))
		topics_dict = self._get_topics_and_distribution()
		topics_dict[self.num_topics] = emo_topics

		return topics_dict

	def query_topic(self, query): 
		# Similarity Queries
		query = self._dictionary.doc2bow(query.lower().split())
		topic, probability = list(sorted(self.lda_model[query], key=lambda x: x[1]))[-1]
		return topic

	def _get_topics_and_distribution(self):
		topics = {}
		for i in range(self.num_topics):
			s = self.lda_model.print_topic(i, topn=10)
			topics[i] = []
			for t in s.split('+'):
				topics[i].append((t.strip().split('*')[1], float(t.strip().split('*')[0])))
		return topics

	def print_topic(self, topic_no, top_n=5):
		if self.lda_model:
			print(self.lda_model.print_topic(topic_no, top_n))

	def save_topics(self, filename, threshold, topics_dict):
		with open(filename, "w") as f:
			for i in range(self.num_topics):
				result = ""
				for t_d in topics_dict[i]:
					if t_d[1] >= threshold:
						result += t_d[0] + " "
				f.write(result.rstrip()+"\n")
			f.write(" ".join([e[0] for e in topics_dict[self.num_topics]]))
Example #29
] for document in documents]

all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens)
                  if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaModel(corpus, id2word=dictionary, num_topics=10)

#print lda.print_topics(num_words=100,num_topics=10)

for i in range(0, lda.num_topics):
    print(lda.print_topic(i))
    string_topic = lda.print_topic(i)
    topic_words = re.findall(r'\b[a-z]+\b', string_topic)
    word_score = re.findall(r'0.\d+', string_topic)
    tp_number = i + 1
    index = 0
    while index < len(topic_words):
        cursor.execute(
            "insert into lda_topic(word,score,topic_number) values (?,?,?)", (
                topic_words[index],
                word_score[index],
                tp_number,
            ))
        index = index + 1

db.commit()