def build_model(dictionary, corpus, n_topics, lemmatized_notes):
    """Build an LDA model and an LDA Mallet model for each candidate topic count."""
    coh_val_lda = []
    coh_val_lda_mallet = []
    model_lda = []
    model_mallet = []
    for topic in n_topics:
        # Build LDA model
        lda_model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=topic,
                             random_state=100,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True)
        coh_lda_model = CoherenceModel(model=lda_model,
                                       texts=lemmatized_notes,
                                       dictionary=dictionary,
                                       coherence='c_v')
        coh_val_lda.append(coh_lda_model.get_coherence())
        model_lda.append(lda_model)

        # Build LDA Mallet model
        mallet_path = 'mallet/bin/mallet'
        lda_mallet = LdaMallet(mallet_path,
                               corpus=corpus,
                               num_topics=topic,
                               id2word=dictionary)
        coh_lda_model = CoherenceModel(model=lda_mallet,
                                       texts=lemmatized_notes,
                                       dictionary=dictionary,
                                       coherence='c_v')
        model_mallet.append(lda_mallet)
        coh_val_lda_mallet.append(coh_lda_model.get_coherence())
    return model_mallet, coh_val_lda_mallet, model_lda, coh_val_lda

def topic_count_selection(dictionary: Dictionary,
                          corpus: list,
                          tokenized_docs: list,
                          test_range: tuple) -> tuple:
    """Measure LDA topic coherence for different numbers of topics.

    Returns
    -------
    lm_list : list
        LDA topic models, one per tested topic count.
    c_v : list
        Coherence values corresponding to each model.
    """
    c_v = []
    lm_list = []
    for num_topics in range(test_range[0], test_range[1]):
        lm = LdaMallet('/home/hadoop/Mallet-master/bin/mallet',
                       corpus=corpus,
                       num_topics=num_topics,
                       id2word=dictionary,
                       iterations=1000,
                       prefix=f'{os.getcwd()}/models/MALLET/',
                       random_seed=42)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=tokenized_docs,
                            dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())
    return lm_list, c_v

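# Usage sketch for topic_count_selection (hedged: `dictionary`, `corpus`, and
# `tokenized_docs` are assumed to be prebuilt gensim objects; the names and the
# (5, 15) range are illustrative, not from the original):
models, scores = topic_count_selection(dictionary, corpus, tokenized_docs, (5, 15))
best_idx = scores.index(max(scores))
print(f"Best coherence {scores[best_idx]:.3f} at {5 + best_idx} topics")
best_model = models[best_idx]
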
def gensim_mallet_lda(self, num_topics=5, num_words=15):
    """Performs Mallet LDA using the Gensim wrapper.

    Requires gensim_corpus output for a column from gensim_preprocessing().

    Args:
        num_topics (int): Desired number of topics to model.
        num_words (int): Number of words to print for each topic.
    """
    mallet_lda_model = LdaMallet(self.mallet_path, corpus=self.gensim_corpus,
                                 num_topics=num_topics, id2word=self.id2word)
    label = self.data_frame.columns.to_numpy()[self.col_num]
    print(f"Column {self.col_num} - Label: {label}\n")
    print(f"MALLET LDA Topic Modeling via Gensim with {num_topics} topics:\n")

    # Print topics and words
    x = mallet_lda_model.show_topics(num_topics=num_topics, num_words=num_words,
                                     log=False, formatted=False)
    topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
    for topic, words in topics_words:
        print(f"Topic {str(topic)}:\n{str(words)}\n")

    coherence = self.coherence_score(mallet_lda_model, self.gensim_words_nostops,
                                     self.id2word)
    print(f"Coherence: {coherence}")

def lda(self, column, method='mallet', save_model=None, load_model=None):
    if method == 'mallet':
        print("Mallet LDA")
    else:
        raise ValueError("Invalid parameter for LDA.method: {}".format(method))
    tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    if not hasattr(self, "vocab"):
        self.__learn_vocab(column)
    if len(self.__bag_of_words) != 0:
        docs, id2word = self.__bag_of_words[column]
    else:
        docs, id2word = self.__get_bag_of_words(column)
    model = LdaMallet(mallet_path=self.mallet_path,
                      id2word=id2word,
                      prefix=tmp_dir,
                      num_topics=self.num_topics,
                      iterations=self.lda_max_iter,
                      optimize_interval=20)
    model.train(docs)
    doc_topics = list()
    for doc_vec in model.read_doctopics(model.fdoctopics()):
        topic_ids, vecs = zip(*doc_vec)
        doc_topics.append(np.array(vecs))
    self.features["lda"] = np.array(doc_topics)
    self.feature_names["lda"] = model.get_topics()
    return

def lda(filename):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    path_to_mallet_binary = "/home/xiu-xiu/Mallet/bin/mallet"
    tweets = []
    with open(filename, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for tweet in reader:
            tweets.append(tweet['text'].split(' '))
    dictionary = Dictionary(tweets)
    corpus = [dictionary.doc2bow(tweet) for tweet in tweets]
    for num_topics in [20, 30, 50]:
        model = LdaMallet(path_to_mallet_binary, corpus=corpus,
                          num_topics=num_topics, id2word=dictionary)
        with open('lda' + str(num_topics) + '.txt', 'w') as result:
            for topic in range(num_topics):
                for word in model.show_topic(topic, topn=20):
                    result.write(word[0] + ' ')
                result.write('\n')

def instanciate_model(self, num_topics, passes, iterations,
                      enable_mallet, optimize_interval, topic_threshold,
                      show_topics_on_creation=False):
    if enable_mallet is True:
        # Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
        os.environ.update({'MALLET_HOME': r'C:/mallet-2.0.8/'})
        self.mallet_path = 'C:\\mallet-2.0.8\\bin\\mallet'  # update this path
        self.lda_model = LdaMallet(self.mallet_path,
                                   corpus=self.corpus,
                                   num_topics=num_topics,
                                   id2word=self.id2word,
                                   iterations=iterations,
                                   optimize_interval=optimize_interval,
                                   topic_threshold=topic_threshold)
        print('Mallet LDA model built\n')
        if show_topics_on_creation is True:
            pprint(self.lda_model.show_topics(formatted=False))
    else:
        self.lda_model = LdaMulticore(corpus=self.corpus,
                                      id2word=self.id2word,
                                      num_topics=num_topics,
                                      random_state=100,
                                      chunksize=500,
                                      passes=passes,
                                      iterations=iterations,
                                      per_word_topics=True)
        print('LDA_MultiCore model built\n')
        if show_topics_on_creation is True:
            pprint(self.lda_model.print_topics())

def topic_modelling(data_object_name):
    """
    Perform topic modeling for a given set of posts (data object).
    :param data_object_name: raw data for topic modeling
    """
    data_words = Serialization.load_obj(data_object_name)
    stop_words = stopwords.words('english')
    print('removing stopwords and infrequent words...')
    ranks = Serialization.load_obj('dict.ranks')
    data_words = Utils.remove_noncontent_words(data_words, stop_words, ranks)
    id2word = corpora.Dictionary(data_words)
    corpus = [id2word.doc2bow(post) for post in data_words]
    topics = CS_TOPICS
    print('performing topic modeling with', topics, 'topics')
    ldamodel = LdaMallet(mallet_path, corpus=corpus, num_topics=topics,
                         id2word=id2word)
    pprint(malletmodel2ldamodel(ldamodel).top_topics(corpus, data_words, id2word))

def setUp(self):
    # Suppose given below are the topics which two different LdaModels come up with.
    # `topics1` is clearly better as it has a clear distinction between system-human
    # interaction and graphs. Hence both the coherence measures for `topics1` should be
    # greater.
    self.topics1 = [['human', 'computer', 'system', 'interface'],
                    ['graph', 'minors', 'trees', 'eps']]
    self.topics2 = [['user', 'graph', 'minors', 'system'],
                    ['time', 'graph', 'survey', 'minors']]
    self.ldamodel = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2,
                             passes=0, iterations=0)

    mallet_home = os.environ.get('MALLET_HOME', None)
    self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None
    if self.mallet_path:
        self.malletmodel = LdaMallet(mallet_path=self.mallet_path, corpus=corpus,
                                     id2word=dictionary, num_topics=2, iterations=0)

    vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None)
    if not vw_path:
        msg = "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model"
        logging.info(msg)
        self.vw_path = None
    else:
        self.vw_path = vw_path
        self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=corpus,
                                       id2word=dictionary, num_topics=2, passes=0)

def fit_lda(prefix, tokenized_docs, id2word,
            mallet_path=os.environ["MALLET_PATH"],
            num_topics=500, iterations=500):
    if not os.path.isdir(prefix):
        os.makedirs(prefix)
    if os.path.exists(os.path.join(prefix, "saved_model.pkl")):
        return utils.SaveLoad.load(os.path.join(prefix, "saved_model.pkl"))
    elif tokenized_docs is None:
        raise ValueError("LDA model not found at {}/{}".format(prefix, "saved_model.pkl"))
    if mallet_path is None or mallet_path == "":
        raise ValueError("No mallet path specified")
    corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs.values.tolist()]
    lda_model = LdaMallet(mallet_path=mallet_path,
                          prefix=prefix,
                          corpus=corpus,
                          id2word=id2word,
                          iterations=iterations,
                          workers=4,
                          num_topics=num_topics,
                          optimize_interval=20)
    lda_model.save(os.path.join(prefix, "saved_model.pkl"))
    id2word.save_as_text(os.path.join(prefix, "id2word"))
    # Save clean LDA weights for later analysis
    W = lda_model.get_topics()
    W = pd.DataFrame(W).rename(columns=id2word)
    W.index = pd.Series(["lda.{}".format(i) for i in range(len(W))], name="topic_id")
    W.to_csv(os.path.join(prefix, "lda_weights.csv"))
    return lda_model

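# Usage sketch for fit_lda (assumptions: the MALLET_PATH environment variable is set,
# `tokenized_docs` is a pandas Series of token lists, and the prefix directory is
# illustrative, not from the original):
from gensim.corpora import Dictionary

id2word = Dictionary(tokenized_docs.values.tolist())
model = fit_lda("models/lda_500", tokenized_docs, id2word,
                num_topics=50, iterations=500)
# A second call with the same prefix short-circuits and returns the cached
# saved_model.pkl instead of retraining.
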
def learn_lda_model(self, corpus, dictionary, k, iterations=100):
    """Learn an LDA model.

    :param corpus: corpus created by gensim
    :param dictionary: dictionary created by gensim
    :param k: number of topics
    :param iterations: number of iterations
    :return: the coherence score and the trained model
    """
    if not self.use_mallet:
        lda = LdaMulticore(corpus, id2word=dictionary, workers=self.cpu_count,
                           num_topics=k, random_state=42, iterations=iterations,
                           per_word_topics=False, eval_every=None)
    else:
        lda = LdaMallet(self.path_to_mallet_binary, corpus=corpus,
                        id2word=dictionary, workers=self.cpu_count,
                        num_topics=k, random_seed=42, iterations=iterations,
                        optimize_interval=10)
    cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    print('{}: {}'.format(k, coherence))
    return coherence, lda

def create_mallet_lda_model(self, no_topics, random_state=42, workers=None,
                            mallet_path="mallet-2.0.8/bin/mallet",
                            iterations=1000, custom_prefix=None):
    """Create a Mallet LDA model using the gensim wrapper.

    :param no_topics: Number of topics for the LDA model
    :param random_state: Random state, to be able to reproduce model creation
    :param workers: Number of workers to use
    :param mallet_path: Path to the mallet binary, e.g. "mallet-2.0.8/bin/mallet"
    :param iterations: Number of sampling iterations over the corpus
    :param custom_prefix: Optional prefix for Mallet's temporary files
    """
    if workers is None:
        workers = self.processes
    if self.bag_of_words is None:
        self.create_bag_of_words()
    if custom_prefix is None:
        prefix = f"{self.path}mallet_temp_"
    else:
        prefix = f"{self.path}mallet_temp_{custom_prefix}_"
    self.lda_model = LdaMallet(num_topics=no_topics,
                               mallet_path=mallet_path,
                               corpus=self.bag_of_words,
                               id2word=self.id2word,
                               random_seed=random_state,
                               iterations=iterations,
                               workers=workers,
                               prefix=prefix)

def run_lda(self, processed_sentences):
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    # List containing the final topic keywords
    topic_top_words = []
    documents = [comment.split() for comment in processed_sentences if comment]
    dictionary = corpora.Dictionary(documents)
    # Filter out words that occur in fewer than 5 comments or in more than half of the comments
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in documents]
    mallet_path = 'C:\\Mallet-2.0.8\\bin\\mallet'
    optimization_interval = 50
    lda_alpha = 1
    lda = LdaMallet(mallet_path, doc_term_matrix,
                    num_topics=self.number_of_topics,
                    id2word=dictionary,
                    optimize_interval=optimization_interval,
                    alpha=lda_alpha)

    # This list contains the word probabilities given a topic
    topic_words_and_probs = []
    for i in range(self.number_of_topics):
        # Get top number_of_lda_keywords_for_assignment words and corresponding
        # probabilities for the topic
        topic_words_and_probs.append(
            lda.show_topic(i, topn=self.number_of_lda_keywords_for_assignment))

    # Keep only words with non-zero probability
    for i in range(len(topic_words_and_probs)):
        temp = []
        for j in topic_words_and_probs[i]:
            if j[1] > 0.0:
                temp.append(j)
                self.total_topic_word.append(j[0])
        topic_words_and_probs[i] = temp

    for i in range(self.number_of_topics):
        # Get the top keywords for the topic and extract the top nouns
        topic_words = [component[0] for component in topic_words_and_probs[i]]
        final_topic_words = []
        for word in topic_words:
            if len(final_topic_words) >= self.number_of_lda_keywords:
                break
            pos = nltk.pos_tag([word])
            word = lemmatizer.lemmatize(word)
            noun_tags = ['NN', 'NNS', 'NP', 'NPS']
            if word not in final_topic_words and pos[0][1] in noun_tags:
                final_topic_words.append(word)
        topic_top_words.append(final_topic_words)
    return topic_top_words, topic_words_and_probs

def get_lda_mallet_model(doc_term_matrix, id2word, fname):
    mallet_path = '../../model/mallet-2.0.8/bin/mallet'
    # Return a previously saved model if one exists
    if fname is not None:
        try:
            return LdaMallet.load(fname)
        except Exception:
            pass
    lda_mallet = LdaMallet(mallet_path=mallet_path,
                           corpus=doc_term_matrix,
                           id2word=id2word,
                           num_topics=10)
    _save_model(lda_mallet, fname=fname)
    return lda_mallet

def main():
    print("\n-----LDA CONCEPT DETECTION-----")
    corpus = load_from_csv(CORPUS_PATH)

    # Create CountVectorizer to get the document-term matrix
    stop_words = load_stop_words("data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True, max_df=MAX_DF, min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")
    proc_corpus, proc_corpus_text_only = remove_short_segs(corpus, vectorizer)
    proc_corpus_text_only = [seg.split() for seg in proc_corpus_text_only]

    # Drop stop words and tokens shorter than three characters
    proc_stop_words = []
    for i in range(len(proc_corpus_text_only)):
        proc_stop_words.append([])
        for j in range(len(proc_corpus_text_only[i])):
            if proc_corpus_text_only[i][j] not in stop_words and len(proc_corpus_text_only[i][j]) >= 3:
                proc_stop_words[i].append(proc_corpus_text_only[i][j])

    # Train vectorizer on corpus
    id2word = Dictionary(proc_stop_words)
    corp = [id2word.doc2bow(text) for text in proc_stop_words]
    # print("Number of Features: " + str(len(feature_names)))

    # Initialize model
    path_to_mallet_binary = "Mallet/bin/mallet"
    mallet_model = LdaMallet(path_to_mallet_binary, corpus=corp, num_topics=14,
                             id2word=id2word, optimize_interval=1,
                             random_seed=9, iterations=5)
    doc_topics = list(mallet_model.read_doctopics(mallet_model.fdoctopics(), renorm=False))

    topic_word = TopicWord(mallet_model)
    topic_word.get_topic_word()
    topic_word.write_to_csv("../output/topic_" + str(mallet_model.random_seed) +
                            "_" + str(mallet_model.iterations) + "_" +
                            str(mallet_model.num_topics) + ".csv")

    topic_doc = TopicDoc(mallet_model)
    topic_doc.get_topic_doc()
    topic_doc.write_to_csv("output/topic_doc" + str(mallet_model.random_seed) +
                           "_" + str(mallet_model.iterations) + "_" +
                           str(mallet_model.num_topics) + ".csv", num_docs=50)
    return 0

def fit(self, X, y=None):
    print('vect2gensim')
    corpus, dictionary = self.vect2gensim(self.vectorizer, X)
    self.model = LdaMallet(self.mallet_path,
                           iterations=self.iterations,
                           corpus=corpus,
                           num_topics=self.n_components,
                           id2word=dictionary)
    return self

def LDA(dictionary, corpus, k_topics, iterations):
    print("Starting LDA...")
    mallet_path = os.path.dirname(os.path.abspath(__file__)) + '/mallet-2.0.8/bin/mallet'
    model = LdaMallet(mallet_path,
                      corpus=corpus,
                      num_topics=k_topics,
                      id2word=dictionary,
                      iterations=iterations)
    return model

def mallet_lda(self, num):
    id2word = corpora.Dictionary(self.data['token'])
    texts = self.data['token']
    corpus = [id2word.doc2bow(text) for text in texts]
    os.environ['MALLET_HOME'] = 'C:\\Mallet'
    mallet_path = 'C:\\Mallet\\bin\\mallet'
    ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=num,
                          id2word=id2word)
    return ldamallet.print_topics(num, num_words=6)

def run():
    # Get the preprocessed dataset
    df = pd.read_pickle('./data/tmp/preprocessed.pkl')

    if os.path.isfile('./models/MALLET/mallet_model.pkl'):
        # Let's not do any model retraining without building in topic stability
        # constraints, e.g. number of docs or tokens now in different topics
        seen = False  # Data we provide is new and unseen for the model
        with open('./models/MALLET/mallet_model.pkl', 'rb') as modelfile:
            topic_model = pickle.load(modelfile)
        with open('./models/MALLET/mallet_dict.pkl', 'rb') as dictfile:
            dictionary = pickle.load(dictfile)
        df['bow'] = df['tokens'].apply(dictionary.doc2bow)
    else:
        seen = True  # Any data we provide is used to train the model
        with Timer('Train the LDA Model'):
            test_range = (5, 50)
            df, corpus, dictionary = get_corpus_and_dict(df, 'tokens')
            list_of_models, scores = topic_count_selection(
                dictionary, corpus, list(df['tokens']), test_range)
            plot_coherence(test_range, scores).savefig('./models/MALLET/ModelCoherence.png')

            # Let's save the model with highest coherence; score index i
            # corresponds to test_range[0] + i topics
            num_topics = test_range[0] + scores.index(max(scores))
            topic_model = LdaMallet('/home/hadoop/Mallet-master/bin/mallet',
                                    corpus=corpus,
                                    num_topics=num_topics,
                                    id2word=dictionary,
                                    iterations=1000,
                                    prefix=f'{os.getcwd()}/models/MALLET/',
                                    random_seed=42)
            print(f"* Chosen Model with {num_topics} topics")
            with open('./models/MALLET/mallet_model.pkl', 'wb') as modelfile:
                topic_model.save(modelfile)
            with open('./models/MALLET/mallet_corpus.pkl', 'wb') as corpusfile:
                pickle.dump(corpus, corpusfile)
            with open('./models/MALLET/mallet_dict.pkl', 'wb') as dictfile:
                pickle.dump(dictionary, dictfile)

    df = get_topic_model_scores(df, topic_model, seen=seen)
    df.to_pickle('./data/tmp/scored.pkl')
    print("\nSample")
    print(df.head(), "\n")

def find_best_number_of_topics(data):
    dictionary = Dictionary(data)
    corpus = [dictionary.doc2bow(text) for text in data]
    scores = dict()
    for topics in range(2, 10, 1):
        print('performing topic modeling with', topics, 'topics')
        ldamodel = LdaMallet(TopicModeling.MALLET_PATH, corpus=corpus,
                             num_topics=topics, id2word=dictionary)
        coherence_model = CoherenceModel(model=ldamodel, texts=data,
                                         coherence='c_v')
        coherence = coherence_model.get_coherence()
        scores[topics] = coherence
    print('coherence scores: the higher, the better:', scores)

def get_lda_mallet_model(doc_term_matrix, id2word, fname):
    mallet_path = '../model/mallet/bin/mallet'
    if params['training']:
        lda_mallet = LdaMallet(mallet_path=mallet_path,
                               corpus=doc_term_matrix,
                               id2word=id2word,
                               workers=6,
                               num_topics=params['num_topics'])
        _save_model('mallet', lda_mallet, fname=fname)
    else:
        lda_mallet = _load_model('mallet', fname)
    return lda_mallet

def model_topic(data_words, topics):
    """Return a topic model for the given data and number of topics.

    :param data_words: data for topic modeling (e.g., a set of posts)
    :param topics: number of desired topics
    :return: topic model
    """
    id2word = corpora.Dictionary(data_words)
    corpus = [id2word.doc2bow(post) for post in data_words]
    print('performing topic modeling with', topics, 'topics')
    return LdaMallet(mallet_path, corpus=corpus, num_topics=topics,
                     id2word=id2word)

def train(self, train_filename):
    print("train LDA")
    train_name = os.path.basename(train_filename)
    model_filename = train_name + ".lda_model"
    if os.path.isfile(model_filename):
        self.model = LdaMallet.load(model_filename)
    else:
        self.corpus = preprocessing.GensimCorpus(train_filename)
        self.model = LdaMallet(mallet_path, self.corpus, num_topics=100,
                               id2word=self.corpus.dictionary)
        self.model.save(model_filename)
    topics_str = self.model.show_topics(num_topics=-1)
    with open(train_name + ".lda_model.topics", 'w') as topics_file:
        topics_file.write(str(topics_str))

def train_model(num_topics, documents):
    # documents = get_dictionary()
    dictionary = corpora.Dictionary(documents)
    max_tokens = len(dictionary.keys())
    # print(f'Num tokens before cleanup {len(dictionary.keys())}')
    dictionary.filter_extremes(no_below=10, no_above=0.7, keep_n=max_tokens)
    # print(f'Num tokens after cleanup {len(dictionary.keys())}')
    corpus_bow = [dictionary.doc2bow(doc) for doc in documents]
    mallet_model = LdaMallet(mallet_path=MALLET_BINARY_PATH,
                             corpus=corpus_bow,
                             id2word=dictionary,
                             num_topics=num_topics)
    lda_model = ldamallet.malletmodel2ldamodel(mallet_model)
    return lda_model, corpus_bow, dictionary

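# Usage sketch for train_model (assumptions: `documents` is a list of token lists
# built elsewhere, and MALLET_BINARY_PATH points at a local Mallet install). The
# malletmodel2ldamodel conversion yields a standard gensim LdaModel, so methods
# such as get_document_topics become available on the returned model:
lda_model, corpus_bow, dictionary = train_model(num_topics=10, documents=documents)
print(lda_model.get_document_topics(corpus_bow[0]))
print(lda_model.print_topics(num_topics=10, num_words=10))
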
def LdaModel(self, num_topics, corpus, dictionary):
    """Create an LDA topic model.

    Input:
        num_topics: number of topics for the model
        corpus: gensim corpus
        dictionary: gensim dictionary
    Output:
        lda_model: a topic model using Latent Dirichlet Allocation (LDA)
    """
    lda_model = LdaMallet(mallet_path=self.path_to_mallet_bin,
                          num_topics=num_topics,
                          corpus=corpus,
                          id2word=dictionary,
                          random_seed=123)
    return lda_model

def compute_coherence_values(dnary, corpus, texts, limit, start=2, step=1):
    coherence_values = []
    model_list = []
    for topics in range(start, limit, step):
        model = LdaMallet(mallet_path, corpus=corpus, id2word=dnary,
                          num_topics=topics, workers=3)
        model_list.append(model)
        coherence_model = CoherenceModel(model=model, texts=texts,
                                         dictionary=dnary, coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())
    return model_list, coherence_values

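# Usage sketch for compute_coherence_values (assumptions: `dictionary`, `corpus`, and
# `texts` already exist; matplotlib is used only for the illustrative plot):
import matplotlib.pyplot as plt

model_list, coherence_values = compute_coherence_values(dictionary, corpus, texts,
                                                        limit=20, start=2, step=1)
plt.plot(range(2, 20, 1), coherence_values)
plt.xlabel("Number of topics")
plt.ylabel("Coherence score (c_v)")
plt.savefig("coherence_by_topic_count.png")
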
def lda(bow, df, vocab):
    # Generate the corpus, then reload the saved copy from disk
    corpus = text_to_corpus(bow)
    corpus = np.load('corpus.npy')
    path_to_mallet = './mallet-2.0.8/bin/mallet'
    model = LdaMallet(path_to_mallet, corpus=corpus, num_topics=5,
                      workers=4, id2word=vocab)
    res = model.print_topics(num_topics=-1, num_words=50)
    # Print topics, then the per-document topic distributions
    for x in res:
        print(x)
    for x in model[corpus]:
        print(x)

def set_model(self, lang: str, data_version: int, dictionary_version: float,
              model_version: str, param_name: str, param_version: int,
              model_file_path: str, language_processed_data: list):
    my_path = os.path.abspath(os.path.dirname(__file__))
    logging.info("---- Creating LDA Mallet model")
    logging.info("------ Getting LDA Mallet model file")
    mallet_path = os.path.join(my_path, "../../statics/mallet-2.0.8/bin/mallet")
    # Index into the dictionary once so that gensim lazily populates id2token
    temp = self.essentials.dictionary[0]
    model = LdaMallet(mallet_path,
                      corpus=self.essentials.corpus,
                      num_topics=self.number_of_topics,
                      id2word=self.essentials.dictionary.id2token)
    model.save(model_file_path)
    self.model = model
    logging.info("---- LDA Mallet model is created")
    metrics = self.get_model_evaluation_metrics(language_processed_data)
    parameters = self.get_model_parameters()
    self.write_model_evaluation_metrics(lang, data_version, dictionary_version,
                                        model_version, param_name, param_version,
                                        metrics, parameters)
    return

def get_LDA_mallet_model(paths, num_topics, iterations, minimum_probability):
    with open(paths[1], 'rb') as f:
        corpus = pickle.load(f)  # sparse terms (sparse matrix form of corpus)
    dictionary = Dictionary.load(paths[9])
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token
    model = LdaMallet(
        mallet_path=os.getenv('MALLET_BIN'),
        corpus=corpus,
        num_topics=num_topics,
        prefix=f'{paths[16]}{num_topics}',
        id2word=id2word,
        workers=3,
        iterations=iterations,
        # topic_threshold=minimum_probability
    )
    return model

def get_lda_model(corpus, id2word, model_type, num_topics, mallet_path):
    if model_type == 'lda':
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=num_topics,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=4,
                                                    alpha='auto',
                                                    per_word_topics=True)
    elif model_type == 'mallet':
        lda_model = LdaMallet(mallet_path, corpus=corpus, id2word=id2word,
                              num_topics=num_topics)
    else:
        raise ValueError("Unknown model type. Available types: 'lda', 'mallet'")
    return lda_model

def train_lda_mallet(corpus, id2word, num_topics, params: dict):
    mallet_path = params.get('mallet_path', MALLET_PATH)
    prefix_path = params.get('prefix_path', str(Path(ARTEFACTS_PATH) / PREFIX_BASE_PATH))
    prefix = params.get('prefix', '')
    if prefix:
        prefix = prefix + '_'
    prefix = str(Path(prefix_path) / prefix)
    iterations = params.get('iterations', ITERATIONS)
    alpha = params.get('alpha', 50)
    random_state = params.get('random_state', RANDOM_STATE)
    return LdaMallet(mallet_path=mallet_path,
                     prefix=prefix,
                     corpus=corpus,
                     id2word=id2word,
                     num_topics=num_topics,
                     alpha=alpha,
                     iterations=iterations,
                     random_seed=random_state)

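# Usage sketch for train_lda_mallet (assumptions: `corpus` and `id2word` are prebuilt
# gensim objects and the path values here are illustrative; any key left out of
# `params` falls back to the module-level defaults referenced above):
params = {
    'mallet_path': '/opt/mallet/bin/mallet',
    'prefix': 'experiment1',
    'iterations': 2000,
    'random_state': 42,
}
model = train_lda_mallet(corpus, id2word, num_topics=20, params=params)
print(model.show_topics(num_topics=5, formatted=True))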