def create_online_lda(docs, ids, name, numTopics): corpus, dictionary = docs2corpus(docs, name, True) print '>> generating online lda model...' lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=numTopics, id2word=dictionary, passes=10) print lda lda.save(name + '.lda') return lda2topicMap(lda, corpus, ids, name), lda.show_topics(formatted=False)
def generateTopics(corpus, dictionary):
    """Fit a 100-topic model on *corpus* and decompose its topic strings.

    Returns ``(model, transformed_corpus, clusters, word_only)`` where
    *clusters* is a list of ``(weight, word)`` pair lists and *word_only*
    is the ':'-joined words of each topic.
    """
    # NOTE(review): despite the local name ``lda``, this builds an LSI
    # model; the LDA variant is deliberately left commented out.
    #lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=100)
    #corpus_lda = lda[corpus]
    lda = lsimodel.LsiModel(corpus, id2word=dictionary, num_topics=100)
    corpus_lda = lda[corpus]

    # Each topic string looks like "w1*word1 + w2*word2 + ..."; split it
    # into (weight, word) pairs. The set() de-duplicates identical topics.
    top_clusters = []
    for topic_str in set(lda.show_topics(num_topics=100, num_words=20)):
        pairs = []
        for term in topic_str.split(" + "):
            parts = term.split("*")
            pairs.append((parts[0], parts[1]))
        top_clusters.append(pairs)

    # Word-only rendering of every topic.
    top_wordonly = []
    for cluster in top_clusters:
        top_wordonly.append(":".join([word for _, word in cluster]))

    return lda, corpus_lda, top_clusters, top_wordonly
''' lda = models.LdaModel.load('model.lda') # applying the LDA model to identify topic for each request using # similarity queries docs = request_text_list lda_topics = [] for doc in docs: vec_bow = dictionary.doc2bow(doc.lower().split()) vec_lda = lda[vec_bow] vec_lda.sort(key=lambda item: -item[1]) lda_topics.append(vec_lda[0][0]) # printing the topics and the words associated with each topic for i in lda.show_topics(): print i # generating a feature for the topics df['lda_topics'] = pd.Series(lda_topics) # generating dummies for each topic topics = pd.get_dummies(df['lda_topics'], prefix='topic') df = pd.concat([df, topics], axis=1) ''' Logistic Regression ''' import statsmodels.formula.api as smf from sklearn.cross_validation import train_test_split
def get_online_lda_topics(name, numTopics):
    """Load the persisted LDA model ``name + '.lda'`` and return its topics.

    Returns up to *numTopics* topics in unformatted (non-string) form.
    """
    model = gensim.models.ldamodel.LdaModel.load(name + '.lda')
    topics = model.show_topics(num_topics=numTopics, formatted=False)
    return topics
def load_online_lda(docs, ids, name): print '>> loading online lda model...' corpus, dictionary = docs2corpus(docs, name, False) lda = gensim.models.ldamodel.LdaModel.load(name + '.lda') # return a map from evidence to topic and a list of topics return lda2topicMap(lda, corpus, ids, name), lda.show_topics(formatted=False)
# --- Word-length distribution over the cleaned corpus -------------------
word_len = pd.Series(clean_all).map(len)
word_len = pd.Series(word_len, name='word_len')
cut_len = pd.concat([cut_result, word_len], axis=1)
group_by_len = cut_len.groupby('word_len')
len_count = group_by_len.count()

# Corpus-wide frequency ranking (``freq`` is the overall FreqDist built
# earlier in the file).
group_by_cut = pd.Series(freq)
sort_freq = group_by_cut.sort_values(ascending=False)
appear_once = freq.hapaxes()  # words that occur exactly once

# --- Top-10 most frequent words for each day ----------------------------
hot_day = []
for i in range(len(clean_day)):
    freq = FreqDist(clean_day[i])
    group_by_word = pd.Series(freq)
    sort_freq = group_by_word.sort_values(ascending=False)
    hot_day.append(sort_freq[:10])
hot_day_stat = pd.DataFrame(hot_day)
hot_day_stat.to_excel("D:\\data\\HotWords5.xlsx")

# --- Build an LDA model over the per-day documents ----------------------
word_dict = corpora.Dictionary(clean_day)
corpus_list = [word_dict.doc2bow(doc) for doc in clean_day]
lda = models.ldamodel.LdaModel(corpus=corpus_list, id2word=word_dict, num_topics=10)

output_file = 'D:\\data\\lda_output.txt'
with open(output_file, 'w') as f:
    for pattern in lda.show_topics():
        # BUG FIX: write one topic per line — the original omitted the
        # newline, concatenating every topic onto a single line.
        f.write("%s\n" % str(pattern))
vec_rp.sort(key=lambda item: -item[1]) rp_topics.append(vec_rp[0][0]) df['rp_topics'] = pd.Series(rp_topics) # applying the LDA model to identify topic for each request using # similarity queries docs = request_text_list lda_topics = [] for doc in docs: vec_bow = dictionary.doc2bow(doc.lower().split()) vec_lda = lda[vec_bow] vec_lda.sort(key=lambda item: -item[1]) lda_topics.append(vec_lda[0][0]) for i in lda.show_topics(): print i for i in lda.print_topics(): print i df['lda_topics'] = pd.Series(lda_topics) # applying the HDP model to identify topic for each request using # similarity queries docs = request_text_list[:30] hdp_topics = [] for doc in docs: vec_bow = dictionary.doc2bow(doc.lower().split()) vec_hdp = hdp[vec_bow] vec_hdp.sort(key=lambda item: -item[1])