Example #1
def ldamodel(doc_clean,n_topics,n_words,description,tfidfmodel=False,unseen_docs=None):
    doc_clean = [min_char(doc).split() for doc in doc_clean]

    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=doc_clean, start=2, limit=40, step=6)
    if tfidfmodel:
       tfidf = TfidfModel(corpus,id2word=dictionary,smartirs='ntc')
       corpus = tfidf[corpus]

    ldamodel = LdaModel(corpus, num_topics=n_topics, id2word=dictionary, random_state=1, passes=50, per_word_topics=True)
    print("#Tópicos LDA")
    for i in range(0, n_topics):
        temp = ldamodel.show_topic(i, n_words)
        terms = []
        for term in temp:
            terms.append(term)
        print("Topic #" + str(i) + ": ", ", ".join([t + '*' + str(i) for t, i in terms]))
    print('Bound: ',ldamodel.bound(corpus))
    # Compute Perplexity
    print('Perplexity: ',ldamodel.log_perplexity(corpus))
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    if unseen_docs:
        corpus_new = [dictionary.doc2bow(doc) for doc in unseen_docs]
        for i, unseen_doc in enumerate(corpus_new):
            topic = None
            score = 0
            inference_doc = ldamodel[unseen_doc]
            print(unseen_docs[i])
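            # per_word_topics=True makes ldamodel[doc] return (topic_dist, word_topics, phi_values),
            # so inference_doc[0] below is the per-topic probability list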
            for index,tmpScore in inference_doc[0]:
                if tmpScore > score:
                    score = tmpScore
                    topic = ldamodel.print_topic(index, 5)
            print ("Score: {}\t Topic: {}".format(score, topic))
        print("Log perplexity for new corpus is", ldamodel.log_perplexity(corpus_new))

    print_result(ldamodel, doc_clean, corpus, n_topics, description)
    pickle.dump(corpus, open(description+'.pkl', 'wb'))
    dictionary.save(description+'dictionary.gensim')
    ldamodel.save(description+'_ldamodel.gensim')
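
The function above leans on helpers that are not shown, min_char and compute_coherence_values (print_result is also external). A minimal sketch of both, assuming min_char lowercases and keeps alphabetic tokens and that the coherence sweep follows the usual gensim recipe:

import re
from gensim.models import LdaModel, CoherenceModel

def min_char(doc, min_len=3):
    # assumed behavior: lowercase and keep alphabetic tokens of a minimum length
    tokens = re.findall(r'[a-z]+', doc.lower())
    return ' '.join(t for t in tokens if len(t) >= min_len)

def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    # assumed behavior: train one model per topic count and report c_v coherence
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=1)
        cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        print(num_topics, 'topics -> coherence:', cm.get_coherence())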
Example #2
def getPageID2TopicDist(docs, n_topics):
    """
	:docs=[(docID,[word1,word2,..]),...]
	:param n_topics: Number of topics to generate from the data
	:return: [docid]=[0.9,0.1] # sequential topic probability
	"""
    # prepare the corpus in Gensim format
    texts = [doc[1] for doc in docs]

    dictionary = corpora.Dictionary(texts)

    corpus = [dictionary.doc2bow(text) for text in texts]
    # print(corpus)
    # apply LDA on the corpus
    lda = LdaModel(corpus, id2word=dictionary, num_topics=n_topics, passes=4)

    # print top terms from each topic
    for i in range(lda.num_topics):
        print(lda.print_topic(i, topn=8))
        print([term for term, _weight in lda.show_topic(topicid=i, topn=8)])

    # get the topic distribution in each document
    pageID2TopicDist = dict()
    doc_topic = []
    for index, doc in enumerate(docs):

        doc_bow = dictionary.doc2bow(doc[1])
        topic_distri = lda[doc_bow]
        dlist = [0.0] * n_topics
        for tup in topic_distri:
            dlist[tup[0]] = tup[1]

        # print(topic_distri)
        # top_topic = sorted(topic_distri, key=lambda x: x[1], reverse=True)[0][0]
        # doc_topic.append(top_topic)

        pageid = doc[0]
        pageID2TopicDist[pageid] = dlist

    # topic of each document
    # print(doc_topic)
    return pageID2TopicDist
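
A hypothetical call, pairing each page ID with its token list:

docs = [('page-1', ['lda', 'topic', 'model']),
        ('page-2', ['gensim', 'corpus', 'dictionary'])]
dist = getPageID2TopicDist(docs, n_topics=2)
print(dist['page-1'])  # one probability per topic, e.g. [0.93, 0.07]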
Example #3
dictionary = Dictionary(texts)

dictionary.filter_extremes(no_above = 0.2)


corpus = [dictionary.doc2bow(text) for text in texts]

print('done with corpus.')

num_topics = 10
lda = LdaModel(corpus, id2word=dictionary, iterations=100, num_topics=num_topics)

#print(lda.print_topics())
for i in range(0,num_topics):
    print("-----------------------------------")
    print(lda.print_topic(i))

#lda.save('lda')
topic = []

for i in range(0, num_topics):
    topic.append(open("./data/topic_" + str(i)+".txt", "w", encoding='utf-8'))
    topic[i].write(lda.print_topic(i) + '\n')
topic.append(open("./data/topic_"+str(num_topics)+".txt", "w", encoding='utf-8'))
topic[num_topics].write("empty topic" + '\n')
#print(dictionary[corpus[1][0][0]] + '-------------')


flag = False

origin = open("./data/output.txt", encoding='utf-8').readlines()
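
The example breaks off here; given the per-topic files and the extra "empty topic" file, the remainder presumably routes each line of output.txt to the file of its dominant topic. A sketch of that step, under that assumption:

for line in origin:
    bow = dictionary.doc2bow(line.split())
    doc_topics = lda.get_document_topics(bow)
    if doc_topics:
        best = max(doc_topics, key=lambda t: t[1])[0]
    else:
        best = num_topics  # falls into the extra "empty topic" file
    topic[best].write(line)
for f in topic:
    f.close()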
Example #4
#  gensim
##################
from gensim import corpora, similarities
from gensim.models.ldamodel import LdaModel
from gensim.models.tfidfmodel import TfidfModel

# gensim's Dictionary expects a list of token lists (e.g. [['a', 'b'], ['c', 'd'], ...])
dictionary = corpora.Dictionary(contents_clean)

# words to vectors
corpus = [dictionary.doc2bow(text) for text in contents_clean]

lda = LdaModel(corpus = corpus, num_topics=10, id2word=dictionary)


print(lda.print_topic(1, 10))
print(lda.print_topic(2, 10))
print(lda.print_topic(3, 10))
print('--------------------------')
for topic in lda.print_topics(num_topics=10, num_words=5):
    print(topic[1])

print('------------tfidf---------')
tfidf = TfidfModel(corpus=corpus)
print(tfidf[dictionary.doc2bow([u'中国', u'报道'])])
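
similarities is imported at the top of this example but never used; a minimal sketch of putting it to work for nearest-document queries over the same LDA space (an assumption, not part of the original):

index = similarities.MatrixSimilarity(lda[corpus], num_features=lda.num_topics)
query = lda[dictionary.doc2bow([u'中国', u'报道'])]
sims = index[query]
print(sorted(enumerate(sims), key=lambda x: -x[1])[:5])  # 5 most similar documents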

Example #5
# construct LDA model
model_lda = LdaModel(corpus=corpus,
                     id2word=id_to_word,
                     num_topics=25, 
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

# print the keywords in each topic
pprint(model_lda.print_topics())

# print top 10 keywords that comprise the topic with index 24 (the last of the 25 topics)
pprint(model_lda.print_topic(24))
# the most important keywords, and their respective weights, that form that topic

# print top 10 keywords that comprise topic with index of 1
pprint(model_lda.print_topic(1))

# TODO (Lee) - infer topic from keywords?


# #### Alternate workflow to Evaluate - model #1

#uncomment to use
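
The TODO above asks about inferring a topic for new keywords; a minimal sketch using get_document_topics, where new_tokens is a hypothetical preprocessed token list and id_to_word is the Dictionary the corpus was built with:

new_tokens = ['keyword', 'another', 'keyword']  # hypothetical input
new_bow = id_to_word.doc2bow(new_tokens)
# per_word_topics=False returns plain (topic_id, probability) pairs
topics = model_lda.get_document_topics(new_bow, per_word_topics=False)
print(sorted(topics, key=lambda t: -t[1])[:3])  # 3 most likely topics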
Example #6
neg_segments = []
for each_review in neg_review_list:
    neg_segments.append(' '.join(jieba.cut(each_review)))
# delete stop words token-wise (substring replace would also clip longer words)
for i in range(len(neg_segments)):
    neg_segments[i] = ' '.join(w for w in neg_segments[i].split()
                               if w not in stop_words)
# write the segmented reviews out for word2vec
with open("neg_reviews_segmented.txt", 'w', encoding='utf-8') as file:
    for seg_review in neg_segments:
        file.write(seg_review + '\n')
# feed into the model

pos_dict = corpora.Dictionary([word.split() for word in pos_segments])
pos_corpus = [pos_dict.doc2bow(i) for i in [word.split() for word in pos_segments]]
pos_lda_model = LdaModel(pos_corpus,num_topics=10,id2word=pos_dict)
print("positive topics:")
for i in range(10):
    print(pos_lda_model.print_topic(i))
print("__________________________")
##negative
print("negative topics:")
neg_dict = corpora.Dictionary([word.split() for word in neg_segments])
neg_corpus = [neg_dict.doc2bow(i) for i in [word.split() for word in neg_segments]]
neg_lda_model = LdaModel(neg_corpus,num_topics=10,id2word=neg_dict)
for i in range(10):
    print(neg_lda_model.print_topic(i))
print("__________________________")
Example #7
    df = pd.read_csv("./total_info.txt",
                     sep=',',
                     header=0,
                     names=['A', 'B', 'C', 'D', 'E', 'F'])
    data = df['B']
    data = data.apply(lambda s: clean_text(s))
    datalist = data.values
    print(datalist)
    # tokenize
    texts = [doc.lower().split() for doc in datalist]
    print(texts[0])

    common_dictionary = Dictionary(texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=20)
    print(lda.print_topic(10, topn=5))

    lda.save('lda.model')
    lda = LdaModel.load('lda.model')

    tryTxt = "while i be suffer i be able to press and go in subscribe but when i press the video it keep show no connection ."
    trylist = tryTxt.lower().split()
    bow = common_dictionary.doc2bow(trylist)
    print(lda.get_document_topics(bow))

    import pyLDAvis.gensim
    # open http://127.0.0.1:8888/ in a browser
    vis = pyLDAvis.gensim.prepare(lda, common_corpus, common_dictionary)
    pyLDAvis.show(vis)
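
Note that pyLDAvis 3.x renamed the gensim bindings, so on newer installs the equivalent is:

import pyLDAvis.gensim_models as gensimvis
vis = gensimvis.prepare(lda, common_corpus, common_dictionary)
pyLDAvis.show(vis)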
Example #8
    # Build LDA model
    print('Training LDA model...')
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=options.num_topics,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=options.iterations,
        alpha='auto',
        per_word_topics=True
    )
    print('...done')

    print('Saving model...')
    model.save(model_path)
    print('...done')

    print('Topics found:')
    for i in range(options.num_topics):
        print(i, ' -> ', model.print_topic(i))
    doc_lda = model[corpus]

    # Compute Perplexity
    print('Perplexity: ', model.log_perplexity(corpus))  # a measure of model fit; the lower, the better

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', coherence_lda)
Example #9
# words_list = []
# for i in tweets[list(tweets.keys())[2]].split("|||"):
#     words =[word for word in nltk.word_tokenize(i) if word not in STOPWORDS and word.isalnum() and len(word)>=2]
#     words_list.append(words)

num_topics = 3
dictionary = corpora.Dictionary(words_list)
corpus = [dictionary.doc2bow(words) for words in words_list]
lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

### output1: topics and corresponding words
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda.print_topics(num_words=10))

### output2: two ways of showing one topic and its corresponding words
lda.print_topic(topicno=0)
lda.show_topic(1)

### output3: show topics of one user (even a new user)
sorted(lda.get_document_topics(corpus[100],
                               minimum_probability=0,
                               per_word_topics=False),
       key=lambda x: x[1],
       reverse=True)

### output4: visualize LDA
lda_display = pyLDAvis.gensim.prepare(lda,
                                      corpus,
                                      dictionary,
                                      R=15,
                                      sort_topics=False)
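
prepare only builds the visualization data; rendering it takes one more call, e.g. pyLDAvis.display(lda_display) in a notebook or pyLDAvis.show(lda_display) from a plain script.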
Example #10
from gensim import corpora
dictionary = corpora.Dictionary([a[1] for a in articles])
corpus = [dictionary.doc2bow(a[1]) for a in articles]

print(corpus[0])

from gensim.models.ldamodel import LdaModel
nr_topics = 5
ldamodel = LdaModel(corpus,
                    num_topics=nr_topics,
                    id2word=dictionary,
                    passes=20)

print(ldamodel.print_topics())

# Show topics by top-3 terms
for t in range(nr_topics):
    print(ldamodel.print_topic(t, topn=3))

# Show some random articles
from random import shuffle
idx = list(range(len(articles)))
shuffle(idx)
for a in idx[:3]:
    article = articles[a]
    print('==========================')
    print(article[0])
    # pick the highest-probability topic, not just the first one returned
    prediction = max(ldamodel[corpus[a]], key=lambda tp: tp[1])
    print(ldamodel.print_topic(prediction[0], topn=3))
    print('Probability:', prediction[1])