def lda2topicMap(lda, corpus, ids, name):
    """Map each document id to the topic distribution inferred by ``lda``.

    Args:
        lda: Object exposing ``get_document_topics(bow, minimum_probability=...)``.
        corpus: Iterable of documents, passed one at a time to
            ``lda.get_document_topics``.
        ids: Indexable sequence of identifiers; ``ids[i]`` is used as the key
            for the i-th document of ``corpus``.
        name: Unused by this function; kept so existing callers keep working.

    Returns:
        dict mapping ``ids[i]`` to whatever ``lda.get_document_topics`` returns
        for the i-th document (called with ``minimum_probability=0.01``).

    Raises:
        IndexError: If ``ids`` has fewer entries than ``corpus``.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('>> generating topic map...')
    evidenceTopicMap = {}
    for position, bow in enumerate(corpus):
        # Index into ids (rather than zip) so a length mismatch still fails
        # loudly instead of silently truncating the map.
        evidenceTopicMap[ids[position]] = lda.get_document_topics(
            bow, minimum_probability=0.01)
    print(len(evidenceTopicMap))
    return evidenceTopicMap
def get_document_topics(doc, name):
    """Infer the topics of a raw text document using a saved LDA model.

    The model is loaded from ``name + '.lda'`` and the dictionary from
    ``name + '.dict'`` on every call.

    Args:
        doc: Text of the document; it is lower-cased and split on whitespace.
        name: Path prefix shared by the saved model and dictionary; also
            forwarded to ``get_stopwords``.

    Returns:
        A ``(document_topics, topic_terms)`` tuple. ``document_topics`` is the
        result of ``lda.get_document_topics`` with ``minimum_probability=0.05``
        and ``topic_terms`` is ``lda.show_topic`` for the highest-probability
        topic. When no topic is returned the result is ``([], '')``; note that
        ``topic_terms`` is an empty string in that case, not a list.
    """
    lda = gensim.models.ldamodel.LdaModel.load(name + '.lda')
    englishStopWords = get_stopwords('english', name)

    # Keep purely alphabetic words longer than one character, singularized,
    # unless the singular form is a stop word. The cheap string checks run
    # first so singularize() is called at most once per word.
    text = []
    for word in doc.lower().split():
        if not word.isalpha() or len(word) <= 1:
            continue
        singular = singularize(word)
        if singular not in englishStopWords:
            text.append(singular)

    dictionary = gensim.corpora.Dictionary.load(name + '.dict')
    document_topics = lda.get_document_topics(
        dictionary.doc2bow(text), minimum_probability=0.05)

    if len(document_topics) == 0:
        return [], ''

    # The primary topic is the (topic_id, probability) pair with the largest
    # probability.
    primary_topic_tuple = max(document_topics, key=lambda x: x[1])
    topic_terms = lda.show_topic(primary_topic_tuple[0])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(topic_terms)
    return document_topics, topic_terms
def gensim_lda(pd_df_yelp, text_rev):
    """Train an LDA model on ``text_rev`` and describe one topic per document.

    Args:
        pd_df_yelp: Unused by this function; kept so existing callers keep
            working.
        text_rev: Sequence of tokenized documents (each an iterable of tokens)
            used both to build the dictionary and as the training corpus.

    Returns:
        A ``(words, probabilities)`` tuple of two lists with one entry per
        document: ``words[i]`` is the list of terms that ``show_topic`` returns
        for the topic selected for document ``i`` and ``probabilities[i]`` is
        the list of their weights.

    Raises:
        IndexError: If ``get_document_topics`` returns no topic for a document.
    """
    common_dict = Dictionary(text_rev)
    common_corpus = [common_dict.doc2bow(text) for text in text_rev]
    lda = LdaModel(common_corpus)

    topics = [lda.get_document_topics(doc) for doc in common_corpus]
    # NOTE(review): this takes the first (topic_id, probability) pair of each
    # document, which is not necessarily the most probable topic - confirm
    # that this is the intended selection.
    topic_ids = [topic[0][0] for topic in topics]
    topic_terms = [lda.show_topic(topic_id) for topic_id in topic_ids]

    # zip() is materialized so each entry can be indexed on Python 3, where
    # zip returns an iterator: entry[0] holds the terms, entry[1] the weights.
    terms_and_weights = [list(zip(*terms)) for terms in topic_terms]

    # The terms are treated as integer-convertible ids and mapped back to
    # tokens; the id -> token mapping is built once rather than per term.
    id_to_token = dict(common_dict)
    topic_prob_list_words = [
        [id_to_token[int(term_id)] for term_id in entry[0]]
        for entry in terms_and_weights
    ]
    topic_prob_list_prob = [list(entry[1]) for entry in terms_and_weights]
    return (topic_prob_list_words, topic_prob_list_prob)
def topic_extraction(corpus, ntopics):
    """Return per-document LDA topic probabilities as a DataFrame.

    Args:
        corpus: Sequence of tokenized documents (each an iterable of tokens).
        ntopics: Number of topics passed to ``LdaModel`` as ``num_topics``.

    Returns:
        pandas.DataFrame with one row per document and one column per topic
        id; each cell is the probability reported for that topic, or ``0.0``
        when the topic is absent from the model's output for that document.
    """
    common_dictionary = Dictionary(corpus)
    common_corpus = [common_dictionary.doc2bow(text) for text in corpus]
    lda = LdaModel(common_corpus, num_topics=ntopics, iterations=800,
                   random_state=1)
    features = lda.get_document_topics(common_corpus, minimum_probability=0)

    # The output is sparse (topic_id, probability) pairs and gensim can omit
    # topics with negligible probability even when minimum_probability=0.
    # Placing each value by its topic id keeps columns aligned with topic ids
    # instead of letting later values shift left when a topic is missing.
    rows = []
    for doc_topics in features:
        row = [0.0] * lda.num_topics
        for topic_id, probability in doc_topics:
            row[topic_id] = probability
        rows.append(row)

    # A freshly built DataFrame already has a 0..n-1 index.
    return pd.DataFrame(rows)