import os
import tempfile

import numpy as np
from gensim.models.wrappers import LdaMallet   # gensim < 4.0


# Method of a larger featurizer class (not shown here); it relies on attributes
# such as self.mallet_path, self.num_topics and self.lda_max_iter set elsewhere.
def lda(self, column, method='mallet', save_model=None, load_model=None):
    if method == 'mallet':
        print("Mallet LDA")
    else:
        raise ValueError("Invalid parameter for LDA.method: {}".format(method))
    # work in a temporary directory for Mallet's intermediate files
    tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    if not hasattr(self, "vocab"):
        self.__learn_vocab(column)
    if len(self.__bag_of_words) != 0:
        docs, id2word = self.__bag_of_words[column]
    else:
        docs, id2word = self.__get_bag_of_words(column)
    model = LdaMallet(mallet_path=self.mallet_path,
                      id2word=id2word,
                      prefix=tmp_dir,
                      num_topics=self.num_topics,
                      iterations=self.lda_max_iter,
                      optimize_interval=20)
    model.train(docs)
    # collect one topic-distribution vector per document
    doc_topics = list()
    for doc_vec in model.read_doctopics(model.fdoctopics()):
        topic_ids, vecs = zip(*doc_vec)
        doc_topics.append(np.array(vecs))
    self.features["lda"] = np.array(doc_topics)
    self.feature_names["lda"] = model.get_topics()
    return
import os

import pandas as pd
from gensim import utils
from gensim.models.wrappers import LdaMallet


def fit_lda(prefix, tokenized_docs, id2word,
            mallet_path=os.environ.get("MALLET_PATH"),  # default to the MALLET_PATH env var
            num_topics=500, iterations=500):
    if not os.path.isdir(prefix):
        os.makedirs(prefix)
    # reuse a previously trained model if one is already saved under this prefix
    if os.path.exists(os.path.join(prefix, "saved_model.pkl")):
        return utils.SaveLoad.load(os.path.join(prefix, "saved_model.pkl"))
    elif tokenized_docs is None:
        raise ValueError("LDA model not found at {}/{}".format(prefix, "saved_model.pkl"))
    if mallet_path is None or mallet_path == "":
        raise ValueError("No mallet path specified")
    corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs.values.tolist()]
    lda_model = LdaMallet(mallet_path=mallet_path, prefix=prefix,
                          corpus=corpus, id2word=id2word,
                          iterations=iterations, workers=4,
                          num_topics=num_topics, optimize_interval=20)
    lda_model.save(os.path.join(prefix, "saved_model.pkl"))
    id2word.save_as_text(os.path.join(prefix, "id2word"))
    # save clean LDA weights for later analysis
    W = lda_model.get_topics()
    W = pd.DataFrame(W).rename(columns=id2word)
    W.index = pd.Series(["lda.{}".format(i) for i in range(len(W))], name="topic_id")
    W.to_csv(os.path.join(prefix, "lda_weights.csv"))
    return lda_model
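# --- usage sketch (not from the original source) ------------------------------
# A minimal, hypothetical example of calling fit_lda. It assumes the MALLET_PATH
# environment variable points at a working Mallet binary and that the tokenized
# documents live in a pandas Series; the data and prefix below are illustrative.
import pandas as pd
from gensim.corpora import Dictionary

tokenized_docs = pd.Series([["blood", "pressure", "study"],
                            ["topic", "model", "mallet", "lda"]])
id2word = Dictionary(tokenized_docs.tolist())
lda_model = fit_lda("lda_run/", tokenized_docs, id2word,
                    num_topics=20, iterations=100)
print(lda_model.get_topics().shape)  # (num_topics, vocabulary size)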
import numpy as np
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.models.wrappers import LdaMallet
from sklearn.base import BaseEstimator, TransformerMixin


class LdaMalletHandler(TransformerMixin, BaseEstimator):
    def __init__(self, n_components=100, mallet_path=None, prefix=None,
                 iterations=1000, vectorizer=None):
        self.n_components = n_components
        self.mallet_path = mallet_path
        self.prefix = prefix
        self.iterations = iterations
        self.vectorizer = vectorizer

    def vect2gensim(self, vectorizer, dtmatrix):
        # transform sparse matrix into gensim corpus and dictionary
        corpus_vect_gensim = Sparse2Corpus(dtmatrix, documents_columns=False)
        dictionary = Dictionary.from_corpus(
            corpus_vect_gensim,
            id2word=dict((id, word) for word, id in vectorizer.vocabulary_.items()))
        return (corpus_vect_gensim, dictionary)

    def fit(self, X, y=None):
        print('vect2gensim')
        corpus, dictionary = self.vect2gensim(self.vectorizer, X)
        # Mallet writes its intermediate files under self.prefix
        self.model = LdaMallet(self.mallet_path, iterations=self.iterations,
                               corpus=corpus, num_topics=self.n_components,
                               id2word=dictionary, prefix=self.prefix)
        return self

    def transform(self, X):
        corpus = Sparse2Corpus(X, documents_columns=False)
        doc_topic = self.model[corpus]
        # densify the (topic_id, weight) pairs into a document-topic matrix
        mat = np.zeros((X.shape[0], self.n_components), dtype=np.float64)
        for did, doc in enumerate(doc_topic):
            for topic in doc:
                mat[did][topic[0]] = topic[1]
        return mat

    def get_doc_topic_matrix(self):
        # parse Mallet's doctopics file: skip the doc index and name columns
        arr = []
        with open(self.model.fdoctopics(), "r") as f:
            lines = f.read().splitlines()
        for line in lines:
            arr.append(line.split()[2:])
        return np.array(arr, dtype=np.float64)

    def get_topic_words_matrix(self):
        return self.model.get_topics()
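# --- usage sketch (not from the original source) ------------------------------
# A hypothetical example of plugging LdaMalletHandler into a scikit-learn
# workflow. The CountVectorizer is fit first so it can be handed to the
# transformer; the documents and the mallet path below are placeholders.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["mallet wraps gibbs sampling lda",
        "gensim exposes mallet through a wrapper class"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)          # sparse document-term matrix
lda = LdaMalletHandler(n_components=5, mallet_path="/opt/mallet/bin/mallet",
                       iterations=200, vectorizer=vectorizer)
doc_topics = lda.fit(X).transform(X)        # shape: (n_docs, n_components)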
import os

from gensim import corpora
from gensim.models.wrappers import LdaMallet

import wenzhang_Lemmatizer1  # project-local preprocessing module (not shown)
# writetopic_wordToExcleFile, writedoc_topicToExcleFile and txt_to_numpy are
# project-local helpers defined elsewhere (not shown here).


def main():
    num_topics = 10
    # doc_topics_path = 'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\10_3_doctopics.txt'
    MALLET_PATH = os.path.join("D:\\Mallet", "mallet-2.0.8", "bin", "mallet.bat")  # r"D:\Mallet\mallet-2.0.8\bin"
    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    # dictionary = corpora.Dictionary.load('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('corpus_mallet_10_3.mm', corpus)  # save the corpus
    # corpus = corpora.MmCorpus('corpus_wenzhang.mm')  # load it back
    # print(os.path.abspath('corpus.mm'))
    mallet_lda_model = LdaMallet(mallet_path=MALLET_PATH, corpus=corpus,
                                 num_topics=num_topics, id2word=dictionary)
    mallet_lda_model.save(
        'C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')
    # mallet_lda_model = LdaMallet.load('C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\mallet_lda_model_10_3.model')
    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics, num_words=20)
    # print(topic_words20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls')
    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())  # doc_topics_path
    # print(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3')
    return texts, word_id, topic_words, doc_topics, num_topics