def lda(self, column, method='mallet', save_model=None, load_model=None):
    """Train a Mallet LDA topic model on the documents in *column*.

    Stores the per-document topic distributions in ``self.features["lda"]``
    and the topic-word matrix in ``self.feature_names["lda"]``.

    Parameters
    ----------
    column : str
        Name of the text column to model.
    method : str
        Only ``'mallet'`` is supported; any other value raises ValueError.
    save_model, load_model
        Accepted but currently unused.  # NOTE(review): placeholders — confirm

    Raises
    ------
    ValueError
        If *method* is not ``'mallet'``.
    """
    if method == 'mallet':
        print("Mallet LDA")
    else:
        # Fixed typo in the error message ("paramater" -> "parameter").
        raise ValueError("Invalid parameter for LDA.method: {}".format(method))

    # Mallet writes intermediate training files; keep them in a temp dir.
    tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    if not hasattr(self, "vocab"):
        self.__learn_vocab(column)

    # Bug fix: the original tested "cache is non-empty" and then indexed by
    # column, which raised KeyError whenever the cache held other columns
    # only.  Test for the specific column instead.
    if column in self.__bag_of_words:
        docs, id2word = self.__bag_of_words[column]
    else:
        docs, id2word = self.__get_bag_of_words(column)

    model = LdaMallet(mallet_path=self.mallet_path,
                      id2word=id2word,
                      prefix=tmp_dir,
                      num_topics=self.num_topics,
                      iterations=self.lda_max_iter,
                      optimize_interval=20)
    model.train(docs)

    # Collect one dense topic-weight vector per document.
    doc_topics = []
    for doc_vec in model.read_doctopics(model.fdoctopics()):
        _, vecs = zip(*doc_vec)  # drop topic ids; weights are in topic order
        doc_topics.append(np.array(vecs))

    self.features["lda"] = np.array(doc_topics)
    self.feature_names["lda"] = model.get_topics()
    return
def main():
    """Run LDA concept detection over the French corpus and dump topic CSVs.

    Returns
    -------
    int
        0 on success (script-style exit code).
    """
    # Fixed typo in the banner ("DETECITON" -> "DETECTION").
    print("\n-----LDA CONCEPT DETECTION-----")
    corpus = load_from_csv(CORPUS_PATH)

    # The vectorizer is only used by remove_short_segs to decide which
    # segments are long enough to keep.
    stop_words = load_stop_words("data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")
    proc_corpus, proc_corpus_text_only = remove_short_segs(corpus, vectorizer)
    proc_corpus_text_only = [seg.split() for seg in proc_corpus_text_only]

    # Drop stop words and tokens shorter than 3 characters.  The set is
    # built once so membership tests inside the loop are O(1).
    stop_set = set(stop_words)
    proc_stop_words = [
        [tok for tok in seg if tok not in stop_set and len(tok) >= 3]
        for seg in proc_corpus_text_only
    ]

    # Build the gensim dictionary and bag-of-words corpus.
    id2word = Dictionary(proc_stop_words)
    corp = [id2word.doc2bow(text) for text in proc_stop_words]

    path_to_mallet_binary = "Mallet/bin/mallet"
    mallet_model = LdaMallet(path_to_mallet_binary,
                             corpus=corp,
                             num_topics=14,
                             id2word=id2word,
                             optimize_interval=1,
                             random_seed=9,
                             iterations=5)
    doc_topics = list(
        mallet_model.read_doctopics(mallet_model.fdoctopics(), renorm=False))

    # Shared "<seed>_<iterations>_<num_topics>.csv" suffix for both outputs.
    suffix = "{}_{}_{}.csv".format(mallet_model.random_seed,
                                   mallet_model.iterations,
                                   mallet_model.num_topics)

    topic_word = TopicWord(mallet_model)
    topic_word.get_topic_word()
    # NOTE(review): this CSV goes to "../output/" while the topic-doc CSV
    # below goes to "output/" — confirm whether both should target the same
    # directory; preserved as-is to avoid changing behavior.
    topic_word.write_to_csv("../output/topic_" + suffix)

    topic_doc = TopicDoc(mallet_model)
    topic_doc.get_topic_doc()
    topic_doc.write_to_csv("output/topic_doc" + suffix, num_docs=50)
    return 0