def topic_model_coherence_generator(texts, start_topic_count, end_topic_count, step):
    """Compute the ``c_v`` coherence of previously saved Mallet LDA models.

    For every topic count in ``range(start_topic_count, end_topic_count + 1, step)``
    the matching dictionary and Mallet model are loaded from disk, a
    bag-of-words corpus is rebuilt from ``texts`` and the coherence score is
    computed.

    Args:
        texts: Tokenised documents (each document is passed to
            ``dictionary.doc2bow``).
        start_topic_count: First topic count to evaluate.
        end_topic_count: Last topic count to evaluate (inclusive).
        step: Increment between evaluated topic counts.

    Returns:
        list: One coherence score per evaluated topic count, in order.
    """
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count + 1, step)):
        dictionary_path = 'dictionary_mallet122_' + str(topic_nums) + '.dictionary'
        dictionary = corpora.Dictionary.load(dictionary_path)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # Path of the saved *model* file (not of the Mallet binary).
        model_path = 'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\122文章\\mallet模型\\dictionary_mallet122_' + str(
            topic_nums) + '.model'
        mallet_lda_model = LdaMallet.load(model_path)

        cv_coherence_model_mallet_lda = CoherenceModel(model=mallet_lda_model,
                                                       corpus=corpus,
                                                       texts=texts,
                                                       dictionary=dictionary,
                                                       coherence='c_v')
        coherence_scores.append(cv_coherence_model_mallet_lda.get_coherence())
    return coherence_scores
def calculate_entropy_mallet_models():
    """Write per-document topic probabilities and an entropy figure to CSV.

    Reads the pickled corpus at ``corpus_path`` and the dataset at
    ``dataset_csv_path``. For each path in ``models_path`` the Mallet model
    is loaded, converted to a gensim LDA model and applied to the corpus. One
    CSV per model is written to
    ``./turn-in/{bigram_threshold}/model_entropy/{num_topics[index]}.csv``.
    """
    # output to csv files.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)
    dataset = pandas.read_csv(dataset_csv_path)
    columns = [
        'Document_No', 'Submission_Num', 'Probabilities', 'Entropy',
        'Submission_Text'
    ]

    for index, model_path in enumerate(models_path):
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)

        doc_topics = lda_model[corpus]
        # Rows are collected in a list and turned into a frame once:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        rows = []
        pbar = tqdm.tqdm(total=len(doc_topics))
        try:
            for i, row in enumerate(doc_topics):
                topic_dist = sorted(row, key=lambda x: (x[1]), reverse=True)
                rs_string = ''.join(
                    'Topic ' + str(topic[0] + 1) + ': ' + str(topic[1]) + '; '
                    for topic in topic_dist)
                # NOTE(review): this sums -log2(p) over the topics, which is
                # not Shannon entropy (-sum(p * log2(p))). Kept unchanged so
                # existing outputs stay comparable -- confirm the intent.
                topic_entropy = 0
                for topic in topic_dist:
                    topic_entropy = topic_entropy + (-math.log2(topic[1]))
                rows.append([
                    str(i), dataset['Submission_Num'][i], rs_string,
                    str(topic_entropy), dataset['Submission_Text'][i]
                ])
                pbar.update(1)
        finally:
            # One progress bar per model, always closed.
            pbar.close()

        df = pd.DataFrame(rows, columns=columns)
        csv_file_result_path = f'./turn-in/{bigram_threshold}/model_entropy/{num_topics[index]}.csv'
        create_file(csv_file_result_path)
        df.to_csv(csv_file_result_path, index=False)
def generate_topic_weight_terms():
    """Export the topic-proportion table of every saved Mallet model as CSV.

    The pickled corpus is read from ``corpus_path``. Each model listed in
    ``models_path`` is loaded, converted to a gensim LDA model and handed to
    ``topics_proportion``; the resulting frame is sorted by ``Topic`` and
    written below ``./turn-in/{bigram_threshold}/topic_proportion_terms/``.
    """
    with open(corpus_path, 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)

    progress = tqdm.tqdm(total=len(models_path))
    for position, model_path in enumerate(models_path):
        mallet_model = LdaMallet.load(model_path)
        converted = models.wrappers.ldamallet.malletmodel2ldamodel(
            mallet_model, iterations=iteration)

        proportions = topics_proportion(lda_model=converted,
                                        corpus=corpus,
                                        num_topics=num_topics[position])
        # Sort rows by topic in ascending order
        proportions.sort_values(by=['Topic'], ascending=True, inplace=True)

        output_path = f'./turn-in/{bigram_threshold}/topic_proportion_terms/{num_topics[position]}.csv'
        create_file(output_path)
        proportions.to_csv(output_path, index=False)
        progress.update(1)
    progress.close()
def format_topics_sentences_mallet(ldamodel: "LdaMallet", corpus, texts):
    """Build a table with the dominant topic of every document.

    Args:
        ldamodel: Topic model; ``ldamodel[corpus]`` must yield, per document,
            a sequence of ``(topic_num, proportion)`` pairs, and
            ``ldamodel.show_topic(topic_num, topn=8)`` must return
            ``(word, weight)`` pairs.
        corpus: Corpus passed to ``ldamodel[...]``.
        texts: Original texts, appended as the last column.

    Returns:
        pandas.DataFrame: Columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals) and ``Topic_Keywords``, followed by an
        unnamed column holding ``texts``. Documents with an empty topic list
        contribute no row.
    """
    # Rows are collected first and the frame is built once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic.
    rows = []
    for row in ldamodel[corpus]:
        if not row:
            continue
        # Dominant topic = highest proportion. max() returns the first
        # maximum, matching the stable descending sort used previously.
        topic_num, prop_topic = max(row, key=lambda x: x[1])
        wp = ldamodel.show_topic(topic_num, topn=8)
        topic_keywords = ", ".join([word for word, prop in wp])
        rows.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
def setUp(self):
    """Create the reference topics and the models used by the tests.

    ``self.malletmodel`` is only created when ``MALLET_HOME`` is set and
    ``self.vwmodel`` only when ``VOWPAL_WABBIT_PATH`` is set; otherwise the
    corresponding ``*_path`` attribute is ``None``.
    """
    # Two competing sets of topics, as two different LdaModels might produce.
    # `topics1` separates system-human interaction from graphs clearly, so
    # both coherence measures are expected to be greater for `topics1`.
    self.topics1 = [
        ['human', 'computer', 'system', 'interface'],
        ['graph', 'minors', 'trees', 'eps'],
    ]
    self.topics2 = [
        ['user', 'graph', 'minors', 'system'],
        ['time', 'graph', 'survey', 'minors'],
    ]
    self.ldamodel = LdaModel(corpus=self.corpus,
                             id2word=self.dictionary,
                             num_topics=2,
                             passes=0,
                             iterations=0)

    mallet_home = os.environ.get('MALLET_HOME', None)
    if mallet_home:
        self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet')
    else:
        self.mallet_path = None
    if self.mallet_path:
        self.malletmodel = LdaMallet(mallet_path=self.mallet_path,
                                     corpus=self.corpus,
                                     id2word=self.dictionary,
                                     num_topics=2,
                                     iterations=0)

    vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None)
    if vw_path:
        self.vw_path = vw_path
        self.vwmodel = LdaVowpalWabbit(self.vw_path,
                                       corpus=self.corpus,
                                       id2word=self.dictionary,
                                       num_topics=2,
                                       passes=0)
    else:
        logging.info(
            "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model"
        )
        self.vw_path = None
def build_lda_model(dictionary, corpus, lda_params, use_mallet=True,
                    mallet_path='C:/mallet/mallet-2.0.8/bin/mallet.bat'):
    """Train an LDA model with either Mallet or gensim.

    Args:
        dictionary: id-to-word mapping passed as ``id2word``.
        corpus: Bag-of-words corpus.
        lda_params: ``(num_topics, alpha, beta)``. ``beta`` is only used by
            the gensim model (as ``eta``); the Mallet call does not receive it.
        use_mallet: Train with ``LdaMallet`` when true, else with ``LdaModel``.
        mallet_path: Location of the Mallet launcher, used only when
            ``use_mallet`` is true. Defaults to the previously hard-coded path.

    Returns:
        The trained model.
    """
    num_topics, alpha, beta = lda_params
    if use_mallet:
        return LdaMallet(mallet_path,
                         corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         alpha=alpha)
    return LdaModel(corpus=corpus,
                    id2word=dictionary,
                    num_topics=num_topics,
                    random_state=33,
                    update_every=1,
                    chunksize=100,
                    passes=10,
                    alpha=alpha,
                    eta=beta,
                    per_word_topics=True)
def _make_mallet_model(self, corpus_filepath, path_to_mallet, remove_stopwords,
                       corpus_language, num_topics, **kwargs):
    """Train and return a gensim ``LdaMallet`` model for the corpus file.

    All words are lowercased; stopwords for ``corpus_language`` are removed
    when ``remove_stopwords`` is true. Side effects: sets ``self._docs`` (an
    OrderedDict of preprocessed documents joined by spaces),
    ``self._full_docs`` (an OrderedDict of the documents returned by
    ``munge.corpus_to_documents``) and ``self._vocabulary`` (list of the
    dictionary's words). Document keys have the form ``"doc<i>"`` where
    ``<i>`` is the position of the document in the corpus.

    Extra keyword arguments are forwarded to ``LdaMallet``.
    """
    munged_corpus = munge.corpus_to_doc_tokens(corpus_filepath)

    # make corpus lowercase, remove stopwords
    # TODO (7/12/19 faunam): make lowercasing corpus optional
    lowered_docs = ([word.lower() for word in doc] for doc in munged_corpus)
    if remove_stopwords:
        # A set makes each membership test O(1); the stopword list was
        # previously scanned linearly for every token.
        stop_words = set(stopwords.words(corpus_language))
        prepped_corpus = [[word for word in doc if word not in stop_words]
                          for doc in lowered_docs]
    else:
        prepped_corpus = list(lowered_docs)

    id_to_word = corpora.Dictionary(prepped_corpus)
    term_document_frequency = [id_to_word.doc2bow(doc) for doc in prepped_corpus]
    mallet_model = LdaMallet(path_to_mallet,
                             corpus=term_document_frequency,
                             id2word=id_to_word,
                             num_topics=num_topics,
                             **kwargs)

    docs = OrderedDict(("doc" + str(i), " ".join(doc))
                       for i, doc in enumerate(prepped_corpus))
    full_corpus = munge.corpus_to_documents(corpus_filepath)
    full_docs = OrderedDict(("doc" + str(i), doc)
                            for i, doc in enumerate(full_corpus))

    self._docs = docs
    self._full_docs = full_docs
    self._vocabulary = list(id_to_word.values())
    return mallet_model
def main():
    """Train a 10-topic Mallet LDA model and export topics and doc-topic weights.

    Builds and saves a dictionary from ``wenzhang_Lemmatizer1.texts2``, trains
    and saves the Mallet model, writes the top 20 words per topic and the
    document-topic matrix via the Excel helper functions.

    Returns:
        tuple: ``(texts, word_id, topic_words, doc_topics, num_topics)``.
    """
    num_topics = 10
    #doc_topics_path='C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\10_3_doctopics.txt'
    # Raw string: "\M" is not a valid escape sequence and triggers a warning
    # in a normal string literal. The resulting path is unchanged.
    MALLET_PATH = os.path.join(r"D:\Mallet", "mallet-2.0.8", "bin",
                               "mallet.bat")  # r"D:\Mallet\mallet-2.0.8\bin"

    texts = wenzhang_Lemmatizer1.texts2
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary_mallet_10_3.dictionary')
    #dictionary = corpora.Dictionary.load('dictionary_mallet_10_3.dictionary')
    word_id = dictionary.token2id
    corpus = [dictionary.doc2bow(text) for text in texts]
    # corpora.MmCorpus.serialize('corpus_mallet_10_3.mm', corpus)  # save the corpus
    # corpus = corpora.MmCorpus('corpus_wenzhang.mm')  # load it back
    # print(os.path.abspath('corpus.mm'))

    mallet_lda_model = LdaMallet(mallet_path=MALLET_PATH,
                                 corpus=corpus,
                                 num_topics=num_topics,
                                 id2word=dictionary)
    mallet_lda_model.save(
        'C:\\Users\\asus\\Desktop\\测试\\model\\mallet_lda_model_10_3.model')
    #mallet_lda_model = LdaMallet.load('C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\mallet模型\\mallet_lda_model_10_3.model')

    topic_words20 = mallet_lda_model.show_topics(num_topics=num_topics,
                                                 num_words=20)
    # print(topic_words20)
    writetopic_wordToExcleFile(
        topic_words20,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\topic_words20_10_3.xls'
    )

    topic_words = mallet_lda_model.get_topics()
    print(len(topic_words), len(topic_words[0]))

    # Document-topic weights are parsed from the file Mallet wrote.
    doc_topics = txt_to_numpy(mallet_lda_model.fdoctopics())  #doc_topics_path
    #print(mallet_lda_model.fdoctopics())
    writedoc_topicToExcleFile(
        doc_topics,
        'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\151文章\\doc_topics20_10_3'
    )
    return texts, word_id, topic_words, doc_topics, num_topics
# Build a term-document matrix with English stopwords removed. It is
# transposed so that documents are columns, which is how it is handed to
# Sparse2Corpus below.
count_vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
count_vectorizer.fit(docs)
doc_word = count_vectorizer.transform(docs).transpose()
corpus = matutils.Sparse2Corpus(doc_word)

# vocab creation: vocabulary_ maps word -> id; id2word is its inverse.
word2id = dict(count_vectorizer.vocabulary_)
id2word = {v: k for k, v in count_vectorizer.vocabulary_.items()}
dictionary = corpora.Dictionary()
dictionary.id2token = id2word
dictionary.token2id = word2id

# topic modeling
ldamallet = LdaMallet(MALLET_PATH,
                      corpus=corpus,
                      num_topics=num_topics,
                      id2word=id2word,
                      iterations=400)

# save topic model to file; the context manager closes the file even when
# pickling fails.
with open("english_topics_{}.pkl".format(sys.argv[1]), "wb") as topic_file:
    pickle.dump(ldamallet.show_topics(formatted=False, num_topics=num_topics),
                topic_file)

# get NPMI coherence
coherence = CoherenceModel(model=ldamallet,
                           texts=texts,
                           dictionary=dictionary,
                           coherence='c_npmi')
print("coherence:", coherence.get_coherence())
# Split the documents: the first 2000 are used for training, the rest for testing.
docs_train = docs[:2000]
docs_test = docs[2000:]

# The dictionary is built from the training documents only.
dictionary = corpora.Dictionary(docs_train)
# Filter terms that occur in more than 50% of docs
dictionary.filter_extremes(no_above=0.5)

# Convert to document term matrix (corpus)
doc_term_mat_train = [dictionary.doc2bow(doc) for doc in docs_train]
doc_term_mat_test = [dictionary.doc2bow(doc) for doc in docs_test]

path_to_mallet_binary = r'C:\mallet\bin\mallet'

# NOTE(review): indentation was lost in this excerpt; everything below is
# assumed to belong to the __main__ guard -- confirm against the original.
if __name__ == "__main__":
    model = LdaMallet(path_to_mallet_binary, corpus=doc_term_mat_train, alpha=5, num_topics=10, id2word=dictionary, optimize_interval=50)

    topics = model.print_topics()
    for topic in topics:
        print(topic)

    # Compute Coherence Score for base model
    coherence_model_lda = CoherenceModel(model=model, corpus=doc_term_mat_train, texts=docs_train, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    # Convert the Mallet wrapper model into a gensim LDA model.
    gensim_model = ldamallet.malletmodel2ldamodel(model)
# Extending the list with itself doubles it (each weight appears twice).
hybrid_weights.extend(hybrid_weights)
hybrid_weights = np.array(hybrid_weights)
# Convert to probabilities
hybrid_weights = hybrid_weights / hybrid_weights.sum()

# GLOBAL num_items_to_pick (with replacement) -- high number.
# NOTE(review): an earlier comment said "half a million", but the value is one million.
num_picks = 1000000

# LOAD MODELS
loadmodstart = time()
id2word_dictionary = corpora.Dictionary.load(
    '/home/ashwath/Programs/ACLAAn/LDA/aclmag.dict')
corpus = corpora.MmCorpus(
    '/home/ashwath/Programs/ACLAAn/LDA/aclmag_bow_corpus.mm')
try:
    ldamallet = LdaMallet.load(
        '/home/ashwath/Programs/ACLAAn/LDA/lda_model.model')
    # Probe the loaded model with a one-word document; a failure here is
    # what triggers the fallback below.
    vec_bow_test = id2word_dictionary.doc2bow(['test'])
    vec_ldamallet = ldamallet[vec_bow_test]
except subprocess.CalledProcessError:
    print("LDA MALLET COULDN'T READ INSTANCE FILE. USING NORMAL LDA INSTEAD")
    # The fallback model is bound to the same name, so later code uses
    # `ldamallet` in either case.
    ldamallet = LdaModel.load(
        '/home/ashwath/Programs/ACLAAn/LDA/ldanormal_acl.model')

#index = similarities.MatrixSimilarity(ldamallet[corpus])
#index.save("simIndex.index")
# Load the prebuilt similarity index (the commented lines above show how it was created).
malletindex = similarities.MatrixSimilarity.load(
    '/home/ashwath/Programs/ACLAAn/LDA/simIndexAcl.index')
with open(
        '/home/ashwath/Programs/ACLAAn/LDA/docid_to_magid_training_acl.pickle',
        'rb') as pick:
    docid_to_magid = pickle.load(pick)
# Small train-or-test driver: when a saved model exists it is loaded and
# applied to the sample documents, otherwise a model is trained and saved.
DICT_PATH = 'docs.dict'
MODEL_PATH = 'docs.model'

raw_corpus = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
docs = [doc.split() for doc in raw_corpus]

if exists(MODEL_PATH):
    print('Testing...\n')
    # Named `dictionary` (as in the training branch) rather than `dict`,
    # which would shadow the builtin for the rest of the module.
    dictionary = corpora.Dictionary.load(DICT_PATH)
    lda = LdaMallet.load(MODEL_PATH)
    for doc in docs:
        topics = lda[dictionary.doc2bow(doc)]
        print(topics, doc)
else:
    print('Training...\n')
    dictionary = corpora.Dictionary(docs)
    dictionary.save(DICT_PATH)
    corpus = [dictionary.doc2bow(text) for text in docs]
    lda = LdaMallet(MALLET_PATH,
                    corpus=corpus,
                    num_topics=3,
                    workers=60,
                    id2word=dictionary,
                    iterations=50,
                    prefix=PREFIX)
    lda.save(MODEL_PATH)
# --+ write plot to file out_f = os.path.join("analysis", "topicModeling", ".output", "pr_coherence_scores.pdf") plt.savefig(out_f, transparent=True, bbox_inches="tight", pad_inches=0) # %% topic model estimation """ I focus on two models: - 8 topics, ~ local optimum - 30 topic, ~ global optimum """ # model with 8 topics # --+ estimate model lda_8 = LdaMallet( mallet_path, corpus=corpus, id2word=dictionary, num_topics=8, random_seed=123 ) # --+ print topics (20 words per topic) lda_8.print_topics(num_topics=8, num_words=20) # --+ translate topic modeling outcome lda_8 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_8) # --+ term-to-topic probabilities (10 words per topic) top_terms_line = lda_8.show_topics(num_topics=8, num_words=10) # ----+ rearrange data on top 10 terms per topic top_terms_m = [] for i in top_terms_line: topic_num = i[0] prob_terms = i[1].split("+") for term_sort, term in enumerate(prob_terms): weight = float(term.split("*")[0])
def load_mallet_model(artefacts_path='./artefacts', suffix=''):
    """Load the saved ``LdaMallet`` model from the artefacts directory.

    The file is ``<artefacts_path>/model``; a non-empty ``suffix`` is
    appended to that path as ``_<suffix>``.
    """
    base_path = str(Path(artefacts_path) / 'model')
    target = base_path + f'_{suffix}' if suffix else base_path
    return LdaMallet.load(target)
corpus = corpora.MmCorpus('mag_bow_corpus.mm') except FileNotFoundError: corpus = [ id2word_dictionary.doc2bow(textlist) for textlist in tqdm(data_stemmed) ] print("Doc2Bow corpus created") # TOO BIG TO SERIALIZE # Save the Dict and Corpus try: corpora.MmCorpus.serialize('mag_bow_corpus.mm', corpus) # save corpus to disk except OverflowError: # Don't save corpus, call LDA directly print("Overflow while saving corpus, skip and train.") ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=300, id2word=id2word_dictionary) print('LDA Model trained') try: ldamallet.save('ldamallet_mag.model') except OverflowError: print("Trying to pickle model using protocol 4") with open('ldamallet_mag.model', 'wb') as pick: pick.dump(ldamallet, pick, protocol=pickle.HIGHEST_PROTOCOL) print("Lda model saved to disk") # Show Topics pprint(ldamallet.show_topics(formatted=False)) # Compute Coherence Score
def main():
    """Train the topic model selected by ``args.model`` and log its quality.

    Steps: load the corpus from ``args.dataset_dir``, build and filter the
    dictionary, write the vocabulary to ``<dump_dir>/vocab.txt``, train the
    chosen model, save it to ``<dump_dir>/lda.model`` and log coherence (and,
    for the non-Mallet models, perplexity).

    Raises:
        ValueError: If ``args.model`` is not one of the supported model names.
    """
    logger.info('-' * 80)
    logger.info('Loading data')
    corpus = load_corpus(args.dataset_dir)

    logger.info('-' * 80)
    logger.info('Make dictionary')
    dictionary = Dictionary(corpus)
    # Filter out words that occur less than 20 documents, or more than 50% of the documents.
    dictionary.filter_extremes(no_below=TOKEN_MIN_DOCS,
                               no_above=TOKEN_MAX_DOCS_FRAC)

    vocab_path = os.path.join(args.dump_dir, 'vocab.txt')
    with open(vocab_path, 'w') as f:
        f.write("\n".join(dictionary.itervalues()) + '\n')

    # Bag-of-words representation of the documents.
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
    logger.info(f'Number of unique tokens: {len(dictionary)}')
    logger.info(f'Number of documents: {len(bow_corpus)}')

    logger.info('-' * 80)
    logger.info('Training model')
    callbacks = []
    if 'perplexity' in args.callbacks:
        perplexity_metric = PerplexityMetric(corpus=bow_corpus)
        callbacks.append(perplexity_metric)
    if 'coherence' in args.callbacks:
        coherence_metric = CoherenceMetric(texts=corpus,
                                           dictionary=dictionary,
                                           coherence=args.coherence,
                                           topn=args.topn)
        callbacks.append(coherence_metric)

    model_path = os.path.join(args.dump_dir, 'lda.model')
    if args.model == 'lda':
        model = LdaModel(corpus=bow_corpus,
                         num_topics=args.num_topics,
                         id2word=dictionary,
                         passes=args.num_epochs,
                         update_every=1,
                         eval_every=args.eval_every,
                         iterations=args.iterations,
                         alpha='auto',
                         eta='auto',
                         chunksize=args.batch_size,
                         callbacks=callbacks,
                         log_dir=args.log_dir,
                         model_dir=model_path)
    elif args.model == 'multicore_lda':
        model = LdaMulticore(corpus=bow_corpus,
                             num_topics=args.num_topics,
                             id2word=dictionary,
                             passes=args.num_epochs,
                             eval_every=args.eval_every,
                             iterations=args.iterations,
                             eta='auto',
                             chunksize=args.batch_size,
                             workers=args.workers,
                             callbacks=callbacks,
                             log_dir=args.log_dir,
                             model_dir=model_path)
    elif args.model == 'mallet_lda':
        model = LdaMallet(args.mallet_path,
                          corpus=bow_corpus,
                          num_topics=args.num_topics,
                          id2word=dictionary,
                          workers=args.workers,
                          prefix=os.path.join(args.dump_dir, 'mallet_'),
                          iterations=args.iterations)
    elif args.model == 'gensim_lda':
        model = GensimLdaModel(corpus=bow_corpus,
                               num_topics=args.num_topics,
                               id2word=dictionary,
                               passes=args.num_epochs,
                               update_every=1,
                               eval_every=args.eval_every,
                               iterations=args.iterations,
                               alpha='auto',
                               eta='auto',
                               chunksize=args.batch_size)
    elif args.model == 'gensim_multicore_lda':
        model = GensimLdaMulticore(corpus=bow_corpus,
                                   num_topics=args.num_topics,
                                   id2word=dictionary,
                                   passes=args.num_epochs,
                                   eval_every=args.eval_every,
                                   iterations=args.iterations,
                                   eta='auto',
                                   chunksize=args.batch_size,
                                   workers=args.workers)
    else:
        # Previously an unknown name fell through and failed later with an
        # opaque UnboundLocalError at ``model.save``.
        raise ValueError(f'Unknown model type: {args.model!r}')
    model.save(model_path)

    logger.info('-' * 80)
    if args.model != 'mallet_lda':
        top_topics = model.top_topics(texts=corpus, coherence='c_v')
        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        avg_topic_coherence = sum(t[1] for t in top_topics) / args.num_topics
        logger.info(f'Average topic coherence: {avg_topic_coherence:.4f}.')
        for topic_idx, (topic_words, topic_score) in enumerate(top_topics):
            logger.info(f'Topic #{topic_idx} ({topic_score:.4f}): ' +
                        " ".join((t[1] for t in topic_words[:5])))
        logger.info(
            f'Perplexity: {np.exp2(-model.log_perplexity(bow_corpus)):.4f}')
    else:
        # NOTE(review): indentation was lost in this excerpt; the coherence
        # computation below is assumed to belong to the Mallet branch.
        pprint(model.show_topics(formatted=False))
        # Compute Coherence Score
        coherence_model_lda = CoherenceModel(model=model,
                                             texts=corpus,
                                             dictionary=dictionary,
                                             coherence=args.coherence,
                                             topn=args.topn)
        coherence_lda = coherence_model_lda.get_coherence()
        logger.info(f'Coherence : {coherence_lda:.4f}')
def extract_features(max_documents=50000000, max_words_per_doc=50000000,
                     incl_tf=True, incl_df=True, incl_graph=True,
                     incl_w2v=True, incl_topic_model=True, incl_atm=True):
    """Extract frequency, network, word2vec and topic-model features.

    All inputs are read from ``../workspace/wikified_periodicals/*.wikified``
    and all outputs are written below ``../workspace/`` (the author-topic
    plot goes to ``atm1.pdf`` in the working directory).

    Args:
        max_documents: Upper bound on documents, passed to the directory
            iterators and counted down in the author-topic section.
        max_words_per_doc: Upper bound on words read per document.
        incl_tf, incl_df, incl_graph: If any is true, term/document
            frequencies, the per-document entity index and the co-occurrence
            network are all computed and dumped.
        incl_w2v: Train and save the Word2Vec model.
        incl_topic_model: Train and save the Mallet LDA model.
        incl_atm: Train, plot and dump the author-topic model.
    """
    ######### SIMPLE FREQUENCY MEASURES ######################################################
    if incl_df or incl_tf or incl_graph:
        # set containers:
        tf, df, network = Counter(), Counter(), nx.Graph()
        doc_ner_idx = {}
        dir_ner_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                             max_documents=max_documents,
                                             max_words_per_doc=max_words_per_doc,
                                             get='wiki')
        dir_filename_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                                  max_documents=max_documents,
                                                  max_words_per_doc=max_words_per_doc,
                                                  get='filename')
        for filename, words in zip(dir_filename_iterator, dir_ner_iterator):
            # count the ners:
            ner_cnt = Counter()
            ner_cnt.update(words)
            if ner_cnt:
                # collect which ners appear in which doc:
                doc_ner_idx[os.path.basename(filename)] = set(ner_cnt)
                # update global tf and df:
                for k, v in ner_cnt.items():
                    tf[k] += v
                    df[k] += 1
                # update nodes in network:
                for ner in ner_cnt:
                    if ner not in network:
                        network.add_node(ner)
                # update edges in network:
                for ner1, ner2 in combinations(ner_cnt, 2):
                    try:
                        network[ner1][ner2]['weight'] += 1
                    except KeyError:
                        network.add_edge(ner1, ner2, weight=1)
        # dump for reuse (context managers close every file handle):
        for obj, dump_path in ((tf, '../workspace/tf.m'),
                               (df, '../workspace/df.m'),
                               (doc_ner_idx, '../workspace/doc_ner_idx.m'),
                               (network, '../workspace/nx.m')):
            with open(dump_path, 'wb') as dump_file:
                pickle.dump(obj, dump_file)
        # scale network values; ``edges()`` replaces ``edges_iter()``, which
        # was removed in NetworkX 2.0.
        max_weight = float(max([network[n1][n2]['weight']
                                for n1, n2 in network.edges()]))
        for n1, n2 in network.edges():
            network[n1][n2]['weight'] /= max_weight
        nx.write_gexf(network,
                      '../workspace/dbnl_network.gexf',
                      prettyprint=True)

    ######### WORD2VEC MODEL ######################################################
    if incl_w2v:
        # build w2v model:
        dir_w2v_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                             max_documents=max_documents,
                                             max_words_per_doc=max_words_per_doc,
                                             get='w2v')
        w2v_model = Word2Vec(dir_w2v_iterator, window=15, min_count=10,
                             size=150, workers=10, negative=5)
        w2v_model.init_sims(replace=True)
        w2v_model.save(os.path.abspath('../workspace/w2v_model.m'))

    ######### STANDARD TOPIC MODEL ######################################################
    if incl_topic_model:
        # build vocab for lda:
        vocab_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                               max_documents=max_documents,
                                               max_words_per_doc=max_words_per_doc,
                                               get='lda_vocab')
        lda_dict = corpora.Dictionary(vocab_lda_iterator)
        lda_dict.filter_extremes(no_below=25, no_above=0.5, keep_n=5000)
        # build lda model:
        dir_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                             max_documents=max_documents,
                                             max_words_per_doc=max_words_per_doc,
                                             get='lda',
                                             lda_dict=lda_dict)
        lda_workspace_path = '../workspace/mallet_output/'
        if not os.path.isdir(lda_workspace_path):
            os.mkdir(lda_workspace_path)
        mallet_path = '/home/mike/GitRepos/dbnl/code/mallet-2.0.8RC2/bin/mallet'
        lda_model = LdaMallet(mallet_path, dir_lda_iterator, num_topics=150,
                              id2word=lda_dict, iterations=1900,
                              prefix=lda_workspace_path)
        lda_model.save('../workspace/lda_model.m')

    ######### AUTHOR TOPIC MODEL ######################################################
    if incl_atm:
        # build vocab for lda:
        vocab_lda_iterator = DirectoryIterator(path_pattern='../workspace/wikified_periodicals/*.wikified',
                                               max_documents=max_documents,
                                               max_words_per_doc=max_words_per_doc,
                                               get='lda_vocab')
        lda_dict = corpora.Dictionary(vocab_lda_iterator)
        lda_dict.filter_extremes(no_below=25, no_above=0.5, keep_n=5000)
        lda_dict.compactify()
        atm_vocab = []
        for i, w in lda_dict.items():
            atm_vocab.append(w)
        print(len(atm_vocab), 'vocab')
        atm_vocab = tuple(atm_vocab)
        # word -> first position in atm_vocab: an O(1) lookup that gives the
        # same result as ``atm_vocab.index(word)`` without scanning the tuple
        # for every token.
        atm_vocab_pos = {}
        for pos_in_vocab, w in enumerate(atm_vocab):
            if w not in atm_vocab_pos:
                atm_vocab_pos[w] = pos_in_vocab

        corpus, doc_author = [], []
        for filename in sorted(glob.glob('../workspace/wikified_periodicals/*.wikified')):
            doc_words, auth_set = [], set()
            max_documents -= 1
            if max_documents % 100 == 0:
                print('\t-', max_documents, 'to go')
            if max_documents <= 1:
                break
            word_cnt = max_words_per_doc
            with codecs.open(filename, 'r', encoding='utf8') as wikified_file:
                for line in wikified_file:
                    comps = line.strip().split('\t')
                    # NOTE(review): ``split`` never returns an empty list, so
                    # this test is always true and a line without exactly
                    # seven fields still raises ValueError. Left unchanged.
                    if comps:
                        idx, token, lemma, pos, pos_conf, ner, wiki = comps
                        if wiki != 'X':
                            auth_set.add(wiki)
                        elif pos.startswith(('N(', 'ADJ(')):
                            # out-of-vocabulary tokens are skipped
                            word_id = atm_vocab_pos.get(token.lower())
                            if word_id is not None:
                                doc_words.append(word_id)
                        word_cnt -= 1
                        if word_cnt <= 0:
                            break
            if auth_set and doc_words:
                corpus.append(sorted(doc_words))
                doc_author.append(sorted(list(auth_set)))

        # replace author names by consecutive integer ids:
        atm_author_idx = {}
        for i1, authors in enumerate(doc_author):
            for i2, auth in enumerate(authors):
                if auth not in atm_author_idx:
                    atm_author_idx[auth] = len(atm_author_idx)
                doc_author[i1][i2] = atm_author_idx[auth]

        n_topic = 30
        atm_model = AuthorTopicModel(n_doc=len(corpus),
                                     n_voca=len(atm_vocab),
                                     n_topic=n_topic,
                                     n_author=len(atm_author_idx))
        atm_model.fit(corpus, doc_author, max_iter=10)

        for k in range(n_topic):
            top_words = get_top_words(atm_model.TW, atm_vocab, k, 10)
            print('topic ', k , ','.join(top_words))

        author_id = 7
        fig = plt.figure(figsize=(12,6))
        plt.bar(range(n_topic), atm_model.AT[author_id]/np.sum(atm_model.AT[author_id]))
        #plt.title(author_idx[author_id])
        plt.xticks(np.arange(n_topic)+0.5, ['\n'.join(get_top_words(atm_model.TW, atm_vocab, k, 10)) for k in range(n_topic)])
        #plt.show()
        plt.savefig('atm1.pdf')

        for obj, dump_path in ((atm_vocab, '../workspace/atm_vocab.m'),
                               (atm_model, '../workspace/atm_model.m'),
                               (atm_author_idx, '../workspace/atm_author_idx.m')):
            with open(dump_path, 'wb') as dump_file:
                pickle.dump(obj, dump_file)
# Create the vocabulary for ii in files: doc_scanner.scan(tokenize_file(ii)) # Initialize the documents docs = doc_scanner.docs dictionary = Dictionary(docs) corpus = [dictionary.doc2bow(doc) for doc in docs] # start = time.time() # gensim_lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=args.num_topics, iterations=args.num_iterations) # time_took = time.time() - start # report(gensim_lda.print_topics(num_topics=10, num_words=50), filename="gensim", limit=50) # print(("Total time it took: %0.5f seconds" % (time_took))) mallet_file = "/home/jihwangk/Desktop/GitDir/Mallet/bin/mallet" # start = time.time() mallet_lda = LdaMallet(mallet_file, corpus=corpus, num_topics=args.num_topics, id2word=dictionary, iterations=args.num_iterations) # time_took = time.time() - start mallet_lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel( mallet_lda, iterations=args.num_iterations) report(mallet_lda.print_topics(num_topics=10, num_words=50), filename="mallet", limit=50) # print(("Total time it took: %0.5f seconds" % (time_took)))
def topic_model(W, K, N):
    """Train one Mallet LDA model and print four coherence measures.

    :param W: minimum number of words per segment
    :param K: number of topics
    :param N: number of iterations
    :return: always 0
    """
    print("\n-----LDA CONCEPT DETECITON-----")
    print('MODEL:', hash((W, K, N)), W, K, N)
    corpus = load_from_csv(CORPUS_PATH)

    # Create CountVectorizer to get Document-Term matrix
    stop_words = load_stop_words("../data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")
    proc_corpus, proc_corpus_text_only = remove_short_segs(
        corpus, vectorizer, W)
    segments = [seg.split() for seg in proc_corpus_text_only]

    # Keep tokens that are not stop words and have at least three characters.
    proc_stop_words = [
        [token for token in segment
         if token not in stop_words and len(token) >= 3]
        for segment in segments
    ]

    # train vectorizer on corpus
    print('Corpus Size:', len(proc_stop_words))
    id2word = Dictionary(proc_stop_words)
    corp = [id2word.doc2bow(text) for text in proc_stop_words]
    # print("Number of Features: " + str(len(feature_names)))

    # redirect stdout for capturing LL/token
    # initialize model
    path_to_mallet_binary = "../mallet_git/bin/mallet"
    mallet_model = LdaMallet(path_to_mallet_binary,
                             corpus=corp,
                             num_topics=K,
                             id2word=id2word,
                             optimize_interval=20,
                             random_seed=9,
                             iterations=N)

    # All four coherence models are built first, then evaluated, then printed.
    measures = ('u_mass', 'c_v', 'c_uci', 'c_npmi')
    labels = ('U_MASS_VAL:', 'C_V_VAL:', 'C_UCI_VAL:', 'C_NPMI_VAL:')
    coherence_models = [
        CoherenceModel(model=mallet_model,
                       texts=proc_stop_words,
                       corpus=corp,
                       coherence=measure)
        for measure in measures
    ]
    values = [cm.get_coherence() for cm in coherence_models]
    for label, value in zip(labels, values):
        print(label, value)
    return 0
#Approach 2 !wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip !unzip mallet-2.0.8.zip def install_java(): !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null #install openjdk os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" #set environment variable !java -version #check java version install_java() os.environ['MALLET_HOME'] = '/content/mallet-2.0.8' mallet_path = '/content/mallet-2.0.8/bin/mallet' #create model ldamallet = LdaMallet(mallet_path, corpus=doc_term_matrix, num_topics=8, id2word=dictionary) pprint(ldamallet.show_topics(formatted=False)) gensimmodel= gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet) #create wrapper for visualization ldamallet_display = pyLDAvis.gensim.prepare(gensimmodel, doc_term_matrix, dictionary, sort_topics=False) pyLDAvis.save_html(ldamallet_display,open("ldamallet_8_topics.html","w")) # Compute Coherence Score coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=reviews, dictionary=dictionary, coherence='c_v') coherence_ldamallet = coherence_model_ldamallet.get_coherence() print('\n Mallet Coherence Score: ', coherence_ldamallet) #Generate Tags def get_reviews_to_process(text):
def main():
    """Train 20 Mallet LDA models (seeds 0-19) and report c_v coherence.

    For every seed a 16-topic model is trained, converted to a gensim LDA
    model and scored. The per-seed scores, the best seed and the median,
    mean and standard deviation are printed.

    Returns:
        int: Always 0.
    """
    print("\n-----LDA CONCEPT DETECITON-----")
    corpus = load_from_csv(CORPUS_PATH)

    # Create CountVectorizer to get Document-Term matrix
    stop_words = load_stop_words("data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")
    proc_corpus, proc_corpus_text_only = remove_short_segs(corpus, vectorizer)
    proc_corpus_text_only = [seg.split() for seg in proc_corpus_text_only]

    # Drop stop words and tokens shorter than three characters.
    proc_stop_words = [
        [word for word in seg if word not in stop_words and len(word) >= 3]
        for seg in proc_corpus_text_only
    ]

    # train vectorizer on corpus
    id2word = Dictionary(proc_stop_words)
    corp = [id2word.doc2bow(text) for text in proc_stop_words]
    # print("Number of Features: " + str(len(feature_names)))

    # initialize model
    path_to_mallet_binary = "/Users/fnascime/Dev/mallet/mallet-2.0.8/bin/mallet"
    coherence_values = []
    for seed in range(20):
        mallet_model = LdaMallet(path_to_mallet_binary,
                                 corpus=corp,
                                 num_topics=16,
                                 id2word=id2word,
                                 optimize_interval=20,
                                 random_seed=seed,
                                 iterations=1000)
        gensim_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_model)
        coherencemodel = CoherenceModel(model=gensim_model,
                                        texts=proc_stop_words,
                                        dictionary=id2word,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    # Renamed from ``max`` so the builtin is not shadowed. 999 remains the
    # sentinel printed when no coherence exceeds 0.
    best_coherence = 0
    best_seed = 999
    for index, coherence in enumerate(coherence_values):
        print ("Seed: ", index, " -> Coherence: ", coherence)
        if coherence > best_coherence:
            best_coherence = coherence
            best_seed = index

    print (" *** Summary ***")
    print (" Best Seed : ", best_seed)
    print ("Best coherence : ", best_coherence)
    print ("Median : ", median(coherence_values))
    print ("Mean : ", mean(coherence_values))
    print ("Stdev : ", stdev(coherence_values))

    #doc_topics = list(mallet_model.read_doctopics(mallet_model.fdoctopics(), renorm=False))
    #topic_word = TopicWord(mallet_model)
    #topic_word.get_topic_word()
    #topic_word.write_to_csv("output/topic_" +str(mallet_model.random_seed) + "_" + str(mallet_model.iterations) + "_" + str(mallet_model.num_topics) + ".csv")
    #topic_doc = TopicDoc(mallet_model)
    #topic_doc.get_topic_doc()
    #topic_doc.write_to_csv("output/topic_doc"+str(mallet_model.random_seed)+ "_" + str(mallet_model.iterations)+ "_" + str(mallet_model.num_topics) + ".csv", num_docs=50)
    return 0
def lda_mallet(mallet_path, corpus, dictionary, num_topics):
    """Train and return an ``LdaMallet`` model.

    ``dictionary`` is passed as ``id2word``; all other settings are left at
    the wrapper's defaults.
    """
    return LdaMallet(
        mallet_path,
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
    )
# # 9 LDA Mallet Model # Now that we have completed our Topic Modeling using "Variational Bayes" algorithm from Gensim's LDA, we will now explore Mallet's LDA (which is more accurate but slower) using Gibb's Sampling (Markov Chain Monte Carlos) under Gensim's Wrapper package. # # Mallet's LDA Model is more accurate, since it utilizes Gibb's Sampling by sampling one variable at a time conditional upon all other variables. # In[20]: import os from gensim.models.wrappers import LdaMallet os.environ.update({'MALLET_HOME':r'/Users/Mick/Desktop/mallet/'}) # Set environment mallet_path = '/Users/Mick/Desktop/mallet/bin/mallet' # Update this path # Build the LDA Mallet Model ldamallet = LdaMallet(mallet_path,corpus=corpus,num_topics=7,id2word=id2word) # Here we selected 7 topics again pprint(ldamallet.show_topics(formatted=False)) # After building the LDA Mallet Model using Gensim's Wrapper package, here we see our 7 new topics in the document along with the top 10 keywords and their corresponding weights that makes up each topic. # ## 9.1 LDA Mallet Model Performance # In[21]: # Compute coherence score coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence="c_v") coherence_ldamallet = coherence_model_ldamallet.get_coherence() print('\nCoherence Score: ', coherence_ldamallet)
import gensim
from gensim.models.wrappers import LdaMallet
# If mallet doesn't work, use normal LDA.
from gensim.models.ldamodel import LdaModel

# Load the trained MALLET model from disk.
ldamallet = LdaMallet.load(
    '/home/ashwath/Programs/MAGCS/LDA/ldamallet_mag50.model'
)

# Convert it to a plain gensim LDA model and save that next to the script.
to_lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel
lda = to_lda_model(ldamallet, gamma_threshold=0.001, iterations=50)
lda.save('lda_mag50.model')
print( 'Train an LDA model over the given corpus using the given dictionary.' ) print('If num_topics is not specified, use the default of 100.') print( 'If num_passes is specified, makes multiple passes over the corpus.' ) print('This uses MALLET to train a topic model.') else: _, mm_fname, dict_fname, model_fname = sys.argv[:4] num_topics = int(sys.argv[4]) if len(sys.argv) >= 5 else 100 try: mallet_path = sep.join( [os.environ['MALLET_HOME'], 'bin', 'mallet']) except KeyError: logging.error('please set the MALLET_HOME environment variable to ' 'the root directory of your MALLET installation') exit() mm = MmCorpus(mm_fname) id2word = Dictionary.load(dict_fname) lda_model = LdaMallet(mallet_path, corpus=normalize_langs(mm), id2word=id2word, num_topics=num_topics, prefix=model_fname[:-6], iterations=100) lda_model.save(model_fname)
def load_gensim_file(file_name):
    """Load a saved ``LdaMallet`` model from ``data/gensim_models/``.

    :param file_name: file name of the model inside ``data/gensim_models/``.
    :return: the object returned by ``LdaMallet.load``.
    """
    model_path = 'data/gensim_models/' + file_name
    return LdaMallet.load(model_path)
class NlPipe:
    """Text preprocessing and LDA topic-modelling pipeline built on spaCy and gensim.

    The pipeline runs in stages that lazily trigger their predecessors:
    ``preprocess_spacy`` -> ``preprocess`` -> ``create_bag_of_words`` ->
    ``create_lda_model`` / ``create_mallet_lda_model``. Intermediate results
    are cached on disk below ``path``.
    """

    def __init__(self, list_of_docs, path, document_ids=None, language_model="en_core_web_lg", tagger=False,
                 parser=False, ner=False, categorization=False, remove_stopwords=True, remove_punctuation=True,
                 set_lower=True, remove_num=True, expand_stopwords=True, language_detection=False,
                 allowed_languages=frozenset({'en'}), no_processes=None):
        """
        :param list_of_docs: List of strings where every document is one string.
        :param path: Prefix for all cache files. Some files are addressed as ``{path}name`` and others as
        ``{path}/name``, so a path ending in a separator works for both.
        :param document_ids: The ids of the documents, matching the order of the list_of_docs
        :param language_model: Spacy language model to be used for text preprocessing
        :param tagger: Use spacy part-of-speech tagger.
        :param parser: Use spacy to annotate syntactic dependencies in documents.
        :param ner: Use spacy for entity recognition and annotation.
        :param categorization: Use spacy to assign document labels
        :param remove_stopwords: Remove stop words during text preprocessing.
        :param remove_punctuation: Remove punctuation during text preprocessing.
        :param set_lower: Convert all strings to lowercase during text preprocessing.
        :param remove_num: Remove numeric characters during text preprocessing.
        :param expand_stopwords: Remove non-alpha-characters in stop words and add them to the stop words.
        :param language_detection: Detect language of docs.
        :param allowed_languages: Allowed language for the documents.
        :param no_processes: Number of worker processes. Defaults to the number of physical cores minus one
        (but at least one).
        """
        self.path = path
        self.pipe_disable = []
        if not tagger:
            self.pipe_disable.append("tagger")
        if not parser:
            self.pipe_disable.append("parser")
        if not ner:
            self.pipe_disable.append("ner")
        if not categorization:
            self.pipe_disable.append("textcat")
        self.remove_punctuation = remove_punctuation
        self.remove_stop_words = remove_stopwords
        self.remove_num = remove_num
        self.set_lower = set_lower
        self.input_docs = list_of_docs
        # Keep None as None: np.array(None) is a 0-d array that defeats every "is not None" check below.
        self.document_ids = None if document_ids is None else np.array(document_ids)
        self.use_gpu = spacy.prefer_gpu()
        self.nlp = spacy.load(language_model)
        if expand_stopwords:
            # Iterate over a copy because the stop-word set is extended while looping.
            stops = list(self.nlp.Defaults.stop_words)
            for stop in stops:
                self.nlp.Defaults.stop_words.add(re.sub(r"[\W]", "", stop))
        self.spacy_docs = None
        self.preprocessed_docs = None
        self.bag_of_words = None
        self.preprocessing_batch_size = 50000
        if no_processes is None:
            # cpu_count may return None, and a single-core machine must still get one worker.
            physical_cores = psutil.cpu_count(logical=False) or 1
            self.processes = max(1, physical_cores - 1)
        else:
            self.processes = no_processes
        self.lda_model = None
        self.result_df = None
        self.word_topic_df = None
        self.allowed_languages = allowed_languages
        self.language_detection = language_detection
        self.id2word = None
        self.coherence_dict = None
        self.max_df = None
        self.min_df = None
        self.use_phrases = None
        self.filter_extremes_value = None
        self.keep_n = None
        self.keep_tokens = None

    def enable_pipe_component(self, component):
        """
        Method to enable components of the spacy pipeline after initialization of the class.
        :param component: Component to enable (see https://spacy.io/usage/processing-pipelines/ for available
        components).
        """
        if component in self.pipe_disable:
            self.pipe_disable.remove(component)

    def disable_pipe_component(self, component):
        """
        Method to disable components of the spacy pipeline after initialization of the class.
        :param component: Component to disable (see https://spacy.io/usage/processing-pipelines/ for available
        components).
        """
        if component not in self.pipe_disable:
            self.pipe_disable.append(component)

    def _build_cache_frame(self, docs):
        """Pair the document ids with *docs* in a two-column frame used for the on-disk caches."""
        ids = self.document_ids
        if ids is None:
            # Without ids, store one empty id per document so the frame keeps one row per document.
            ids = [None] * len(docs)
        frame = pd.DataFrame([ids, docs]).transpose()
        frame.columns = ['thread_id', 'preprocessed_text']
        return frame

    def preprocess_spacy(self, load_existing=True, save_data=True, filter_loaded=None):
        """
        Method to preprocess the documents using spacy with the enabled pipeline components.
        :param load_existing: Load the cached spacy docs from disk if the cache file exists.
        :param save_data: Pickle the freshly processed docs to the cache file.
        :param filter_loaded: Optional selector passed to ``.loc`` to load only a subset of the cached docs.
        """
        cache_path = f"{self.path}text_df_preprocessed_spacy"
        if os.path.exists(cache_path) and load_existing:
            cached_texts = pd.read_pickle(cache_path)['preprocessed_text']
            if filter_loaded is not None:
                cached_texts = cached_texts.loc[filter_loaded]
            self.spacy_docs = cached_texts.to_list()
            return

        doc_stream = self.nlp.pipe(self.input_docs, disable=self.pipe_disable, n_process=self.processes,
                                   batch_size=self.preprocessing_batch_size)
        if self.language_detection:
            # Keep only documents whose detected language is allowed.
            self.spacy_docs = [doc for doc in tqdm(doc_stream, desc="Preprocessing text with spacy: ")
                               if detect(doc.text) in self.allowed_languages]
        else:
            self.spacy_docs = list(tqdm(doc_stream, desc="Preprocessing spacy"))
        if save_data:
            self._build_cache_frame(self.spacy_docs).to_pickle(cache_path)

    def preprocess(self, load_existing=True, filter_loaded=None):
        """
        Remove stop words, numbers and punctuation as well as lower case all of the tokens, depending on the
        settings passed to the class during initialization.
        :param load_existing: Load the cached preprocessed docs from disk if the cache file exists.
        :param filter_loaded: Optional selector passed to ``.loc``; it is also applied to the document ids.
        """
        cache_path = f"{self.path}/text_df_preprocessed"
        if os.path.exists(cache_path) and load_existing:
            print("Found preprocessed data. Loading")
            preprocessed_df = pd.read_pickle(cache_path)
            if filter_loaded is None:
                self.preprocessed_docs = preprocessed_df['preprocessed_text'].to_list()
                print('Preprocessed data loaded.')
            else:
                self.preprocessed_docs = preprocessed_df['preprocessed_text'].loc[filter_loaded].to_list()
                if isinstance(self.document_ids, np.ndarray):
                    self.document_ids = self.document_ids[filter_loaded]
                print(f'{sum(filter_loaded)} preprocessed docs of {len(self.input_docs)} docs loaded.')
            return

        self.preprocessed_docs = []
        if not self.spacy_docs:
            self.preprocess_spacy()
        for spacy_doc in tqdm(self.spacy_docs, desc="Removing stop words/punctuation/numeric chars: "):
            doc = []
            for token in spacy_doc:
                # todo: check if useful condition
                if token.is_stop:
                    if self.remove_stop_words:
                        continue
                    # Stop words that are kept are stored as raw text, not as lemma.
                    word = token.text
                else:
                    word = token.lemma_
                if self.set_lower:
                    word = word.lower()
                if self.remove_num:
                    word = re.sub(r"[\d]", "", word)
                if self.remove_punctuation:
                    word = re.sub(r"[\W]", "", word)
                if len(word) >= 2 and word != "wbr":
                    doc.append(word)
            self.preprocessed_docs.append(doc)
        self._build_cache_frame(self.preprocessed_docs).to_pickle(cache_path)

    def create_bag_of_words(self, filter_extremes=True, min_df=5, max_df=0.5, keep_n=100000, keep_tokens=None,
                            use_phrases=None, bigram_min_count=1000, bigram_threshold=100, trigram_threshold=100,
                            load_existing=True, tfidf=False):
        """
        Build the dictionary (or load a cached one) and the bag-of-words representation of the documents.
        :param filter_extremes: En-/Disable filtering of tokens that occur too frequent/not frequent enough
        (https://radimrehurek.com/gensim/corpora/dictionary.html)
        :param min_df: Keep only tokens that appear in at least n documents (see link above)
        :param max_df: Keep only tokens that appear in less than the fraction of documents (see link above)
        :param keep_n: Keep only n most frequent tokens (see link above)
        :param keep_tokens: Iterable of tokens not to be removed (see link above)
        :param use_phrases: Set to bigram or trigram if the use of Gensim Phrases
        (https://radimrehurek.com/gensim/models/phrases.html) is wanted. Will create bigrams/trigrams of frequently
        co-occurring words (e.g. "new", "york" => "new_york").
        :param bigram_min_count: Minimum occurrence of bigrams to be considered by Gensim Phrases.
        :param bigram_threshold: Threshold for Gensim Phrases bigram settings.
        :param trigram_threshold: Threshold for Gensim Phrases trigram settings.
        :param load_existing: Load a cached dictionary if one exists for these settings.
        :param tfidf: Re-weight the bag of words with tf-idf.
        :raises Exception: If use_phrases is invalid or a phrase setting that is actually used is not an integer.
        """
        if use_phrases not in {None, "bigram", "trigram"}:
            raise Exception("Please use valid option (None, 'bigram' or 'trigram) to make use of this function.")
        # Validate only the settings that the selected phrase mode really uses.
        required_ints = []
        if use_phrases in ("bigram", "trigram"):
            required_ints.extend([bigram_threshold, bigram_min_count])
        if use_phrases == "trigram":
            required_ints.append(trigram_threshold)
        if not all(isinstance(value, int) for value in required_ints):
            raise Exception("Thresholds or minimum count for bigrams/trigrams not integer. Please provide "
                            "threshold and minimum count for bigrams (and trigrams) as integer.")

        if not self.preprocessed_docs:
            self.preprocess()
        dict_path = f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}"
        if os.path.exists(dict_path) and load_existing:
            self.load_dict(path=dict_path)
            self.filter_extremes_value = filter_extremes
            self.min_df = min_df
            self.max_df = max_df
            self.use_phrases = use_phrases
        else:
            # Bigrams are built exactly once; the trigram step runs on top of them.
            if use_phrases in ("bigram", "trigram"):
                self.create_bigrams(bigram_min_count=bigram_min_count, bigram_threshold=bigram_threshold)
            if use_phrases == "trigram":
                self.create_trigrams(trigram_threshold=trigram_threshold)
            self.create_dictionary(filter_extremes=filter_extremes, min_df=min_df, max_df=max_df, keep_n=keep_n,
                                   keep_tokens=keep_tokens, use_phrases=use_phrases)
        self.create_bag_of_words_matrix(tfidf=tfidf)

    def create_bigrams(self, bigram_min_count, bigram_threshold):
        """Train a Phrases bigram model on the preprocessed docs and replace the docs by their bigram version."""
        self.bigram_phrases = Phrases(self.preprocessed_docs, min_count=bigram_min_count,
                                      threshold=bigram_threshold)
        self.bigram_phraser = Phraser(self.bigram_phrases)
        self.preprocessed_docs = [self.bigram_phraser[doc]
                                  for doc in tqdm(self.preprocessed_docs, desc="Extracting bigrams")]

    def create_trigrams(self, trigram_threshold):
        """Train a trigram model on top of the bigram model; ``create_bigrams`` has to be run first."""
        trigram_phrases = Phrases(self.bigram_phrases[self.preprocessed_docs], threshold=trigram_threshold)
        trigram_phraser = Phraser(trigram_phrases)
        self.preprocessed_docs = [trigram_phraser[self.bigram_phraser[doc]]
                                  for doc in tqdm(self.preprocessed_docs, desc="Extracting trigrams")]

    def create_bag_of_words_matrix(self, tfidf=False):
        """Convert every preprocessed doc to its bag-of-words vector, optionally tf-idf weighted."""
        self.bag_of_words = [self.id2word.doc2bow(doc)
                             for doc in tqdm(self.preprocessed_docs, desc='Creating bag of words')]
        if tfidf:
            self.create_tfidf()

    def create_dictionary(self, filter_extremes, min_df, max_df, keep_n, keep_tokens, use_phrases):
        """Create the gensim dictionary from the preprocessed docs, optionally filter it, and save it to disk."""
        print('Creating dictionary.')
        self.id2word = corpora.Dictionary(self.preprocessed_docs)
        # todo: add autosave of dictionary here
        self.max_df = max_df
        self.min_df = min_df
        self.use_phrases = use_phrases
        self.filter_extremes_value = filter_extremes
        self.keep_n = keep_n
        self.keep_tokens = keep_tokens
        if filter_extremes:
            self.filter_extremes(min_df=self.min_df, max_df=self.max_df, keep_n=self.keep_n,
                                 keep_tokens=self.keep_tokens)
        self.save_dict(path=f"{self.path}/gensim_dict_{filter_extremes}_{min_df}_{max_df}_{use_phrases}")

    def filter_extremes(self, min_df, max_df, keep_n, keep_tokens=None):
        """
        Filter the dictionary in place and remember the settings that were used.
        :param min_df: Passed to gensim as ``no_below``.
        :param max_df: Passed to gensim as ``no_above``.
        :param keep_n: Keep only the n most frequent tokens.
        :param keep_tokens: Tokens that must not be removed. When omitted, a fresh empty list is used.
        """
        if keep_tokens is None and self.keep_tokens is None:
            # Direct call without tokens: use a fresh list instead of a shared mutable default.
            keep_tokens = []
        elif keep_tokens is None:
            # NOTE: an explicit None (as passed on by create_dictionary) is forwarded unchanged.
            keep_tokens = None
        self.filter_extremes_value = True
        self.max_df = max_df
        self.min_df = min_df
        self.keep_n = keep_n
        self.keep_tokens = keep_tokens
        self.id2word.filter_extremes(no_below=self.min_df, no_above=self.max_df, keep_n=keep_n,
                                     keep_tokens=keep_tokens)

    def create_tfidf(self):
        """Replace the bag of words by its tf-idf weighted version."""
        tfidf_model = TfidfModel(self.bag_of_words)
        self.bag_of_words = [tfidf_model[vector]
                             for vector in tqdm(self.bag_of_words, desc="Creating tf-idf matrix")]

    def create_lda_model(self, no_topics=10, random_state=42, passes=5, alpha='auto', eta=None, workers=None,
                         chunksize=2000):
        """
        Train a gensim LdaMulticore model and store it in ``self.lda_model``.
        :param no_topics: Number of topics that are to be explored by lda model
        :param random_state: Random state for reproducible results (default 42, gensim default is None)
        :param passes: Number of times the whole corpus is processed.
        :param alpha: Topic-document distribution prior alpha.
        :param eta: Word-topic distribution prior eta (beta). Defaults to 1 / no_topics.
        :param workers: Number of workers to use. Defaults to ``self.processes``.
        :param chunksize: chunksize parameter of gensim
        """
        # NOTE(review): gensim documents only 'symmetric'/'asymmetric' as string priors for LdaMulticore;
        # confirm that the default alpha='auto' is accepted by the installed gensim version.
        if eta is None:
            eta = 1 / no_topics
        if workers is None:
            workers = self.processes
        if self.bag_of_words is None:
            self.create_bag_of_words()
        self.lda_model = LdaMulticore(corpus=self.bag_of_words, id2word=self.id2word, num_topics=no_topics,
                                      eta=eta, workers=workers, random_state=random_state, alpha=alpha,
                                      passes=passes, chunksize=chunksize)

    def create_mallet_lda_model(self, no_topics, random_state=42, workers=None,
                                mallet_path="mallet-2.0.8/bin/mallet", iterations=1000, custom_prefix=None):
        """
        Method to create a mallet lda model using gensim wrapper for lda mallet
        :param no_topics: Number of topics for lda model
        :param random_state: Random state to be able to reproduce model creation
        :param workers: Number of workers to use
        :param mallet_path: path to mallet binary, e.g. "mallet-2.0.8/bin/mallet"
        :param iterations: Passed to LdaMallet as ``iterations``.
        :param custom_prefix: Optional extra component for the prefix of MALLET's temporary files.
        """
        if workers is None:
            workers = self.processes
        if self.bag_of_words is None:
            self.create_bag_of_words()
        if custom_prefix is None:
            prefix = f"{self.path}mallet_temp_"
        else:
            prefix = f"{self.path}mallet_temp_{custom_prefix}_"
        self.lda_model = LdaMallet(num_topics=no_topics, mallet_path=mallet_path, corpus=self.bag_of_words,
                                   id2word=self.id2word, random_seed=random_state, iterations=iterations,
                                   workers=workers, prefix=prefix)

    def calculate_coherence(self, model=None, coherence_score='c_v', workers=None):
        """
        Method to calculate the coherence score of a given lda model. The model can either be provided or will be
        taken from the class.
        :param model: Model to use instead of the model saved within the class.
        :param coherence_score: Coherence score to calculate
        :param workers: Number of workers to use for coherence evaluation.
        :return: Return coherence model, which also contains the coherence score of a model.
        """
        if workers is None:
            workers = self.processes
        if model is None:
            model = self.lda_model
        shared = dict(model=model, dictionary=self.id2word, coherence=coherence_score, processes=workers)
        # u_mass is computed from the bag-of-words corpus, every other measure from the token lists.
        if coherence_score != 'u_mass':
            return CoherenceModel(texts=self.preprocessed_docs, **shared)
        return CoherenceModel(corpus=self.bag_of_words, **shared)

    def _load_coherence_results(self, path):
        """Load previously pickled coherence results from *path* into ``self.coherence_dict`` (or start empty)."""
        if os.path.exists(path):
            print("coherence results found")
            # NOTE(security): pickle executes code on load; only point this at files this class wrote.
            with open(path, "rb") as f:
                self.coherence_dict = pickle.load(f)
        else:
            self.coherence_dict = {}

    def _initial_best_score(self, save_best_model):
        """Return the stored best score, or -inf so that negative coherences can also become the best."""
        if self.coherence_dict and save_best_model:
            return self.coherence_dict.get('best_score', float("-inf"))
        return float("-inf")

    def _settings_key_suffix(self):
        """Return the part of a coherence key that encodes the dictionary settings."""
        return f"-filter={self.filter_extremes_value}" \
               f"-min_df={self.min_df}-max_df={self.max_df}-phrases={self.use_phrases}" \
               f"-k_n={self.keep_n}-k_t={self.keep_tokens}"

    def _score_current_model(self, coherence_key, no_topics, coherence_scores, workers, save_models,
                             save_best_model, best_score, best_extras, path):
        """Score ``self.lda_model``, record the results under *coherence_key*, checkpoint and return the best score."""
        entry = self.coherence_dict[coherence_key] = {}
        if save_models:
            entry["lda_model"] = self.lda_model
        for coherence_score in coherence_scores:
            coherence_model = self.calculate_coherence(coherence_score=coherence_score, workers=workers)
            coherence_result = coherence_model.get_coherence()
            if save_models:
                entry["coherence_model"] = coherence_model
            entry[coherence_score] = coherence_result
            if coherence_result > best_score:
                if save_best_model:
                    self.coherence_dict["best_score"] = coherence_result
                    self.coherence_dict["best_model"] = self.lda_model
                    self.coherence_dict["best_topic_no"] = no_topics
                    self.coherence_dict.update(best_extras)
                best_score = coherence_result
        # Checkpoint after every evaluated configuration so an interrupted search can resume.
        with open(path, "wb") as f:
            pickle.dump(self.coherence_dict, f)
        return best_score

    def search_best_model(self, topic_list=frozenset({2, 3, 4, 5, 10, 15, 20, 25}), alphas=(0.9, 0.5, 0.1),
                          etas=('auto', 0.9, 0.5, 0.1), save_best_model=True, save_models=False,
                          return_best_model=False, passes=1, coherence_scores=('c_v',), chunksize=2000,
                          workers=None, coherence_suffix=None):
        #todo: save best model within class.
        """
        Method to search for the best lda model for a given number of topics. The best model will be determined by
        its coherence score.
        :param topic_list: Iterable of integers of topics to test the coherence score for.
        :param alphas: Iterable of alpha priors to test.
        :param etas: Iterable of eta priors to test.
        :param save_best_model: Set to true if the best model has to be saved within the class.
        :param save_models: If set to false (default) only the coherence score for each combination of numbers of
        topics and alphas will be saved. If set to true, the lda model, the coherence score and the coherence model
        will be saved.
        :param return_best_model: If true, the method will return the best found model and the number of topics of
        this model.
        :param passes: Number of passes for every trained model.
        :param coherence_scores: Iterable of coherence measures to calculate for every model.
        :param chunksize: chunksize parameter of gensim
        :param workers: Number of workers to use. Defaults to ``self.processes``.
        :param coherence_suffix: Optional suffix for the name of the results file.
        :return: Number of topics for the best result and the model with the best result of the coherence score
        (only if return_best_model is set).
        :raises Exception: If return_best_model is set without save_best_model.
        """
        if coherence_suffix is None:
            path = f"{self.path}coherence_results"
        else:
            path = f"{self.path}coherence_results_{coherence_suffix}"
        self._load_coherence_results(path)
        if workers is None:
            workers = self.processes
        if return_best_model and not save_best_model:
            raise Exception("To return the best model, the parameter save_best_model has to be set to True.")
        best_score = self._initial_best_score(save_best_model)
        for no_topics in tqdm(topic_list, desc="Calculating topic coherences: "):
            for alpha in tqdm(alphas, desc='Alphas'):
                for eta in tqdm(etas, desc='Etas'):
                    coherence_key = f"no={no_topics}-a={alpha}-e={eta}" + self._settings_key_suffix()
                    if coherence_key in self.coherence_dict:
                        print("coherence value found, skipping")
                        continue
                    self.create_lda_model(no_topics=no_topics, alpha=alpha, eta=eta, passes=passes,
                                          chunksize=chunksize, workers=workers)
                    best_score = self._score_current_model(
                        coherence_key=coherence_key, no_topics=no_topics, coherence_scores=coherence_scores,
                        workers=workers, save_models=save_models, save_best_model=save_best_model,
                        best_score=best_score, best_extras={"best_alpha": alpha, "best_eta": eta}, path=path)
        if return_best_model:
            #returns number of topics and the lda_model
            return self.coherence_dict["best_topic_no"], self.coherence_dict["best_model"]

    def search_best_model_mallet(self, topic_list=frozenset({2, 3, 4, 5, 10, 15, 20, 25}), save_best_model=True,
                                 save_models=False, return_best_model=False, coherence_scores=('c_v',),
                                 workers=None, coherence_workers=None, coherence_suffix=None, random_state=42,
                                 mallet_path="mallet-2.0.8/bin/mallet", iterations=1000):
        """
        Search for the MALLET lda model with the best coherence score over a list of topic numbers.
        :param topic_list: Iterable of integers of topics to test the coherence score for.
        :param save_best_model: Set to true if the best model has to be saved within the results.
        :param save_models: Also store every trained model and coherence model in the results.
        :param return_best_model: If true, return the number of topics and the model of the best result.
        :param coherence_scores: Iterable of coherence measures to calculate for every model.
        :param workers: Number of workers for model training. Defaults to ``self.processes``.
        :param coherence_workers: Number of workers for the coherence calculation. Defaults to ``self.processes``.
        :param coherence_suffix: Optional suffix for the name of the results file.
        :param random_state: Random seed passed to MALLET.
        :param mallet_path: path to mallet binary
        :param iterations: Passed to LdaMallet as ``iterations``.
        :return: Number of topics and model of the best result (only if return_best_model is set).
        :raises Exception: If return_best_model is set without save_best_model.
        """
        if coherence_suffix is None:
            path = f"{self.path}coherence_results_mallet"
        else:
            path = f"{self.path}coherence_results_mallet_{coherence_suffix}"
        self._load_coherence_results(path)
        if workers is None:
            workers = self.processes
        if coherence_workers is None:
            coherence_workers = self.processes
        if return_best_model and not save_best_model:
            raise Exception("To return the best model, the parameter save_best_model has to be set to True.")
        best_score = self._initial_best_score(save_best_model)
        for no_topics in tqdm(topic_list, desc="Calculating topic coherences: "):
            coherence_key = f"mallet-no={no_topics}" + self._settings_key_suffix()
            if coherence_key in self.coherence_dict:
                print("coherence value found, skipping")
                continue
            self.create_mallet_lda_model(no_topics=no_topics, workers=workers, random_state=random_state,
                                         mallet_path=mallet_path, iterations=iterations)
            best_score = self._score_current_model(
                coherence_key=coherence_key, no_topics=no_topics, coherence_scores=coherence_scores,
                workers=coherence_workers, save_models=save_models, save_best_model=save_best_model,
                best_score=best_score, best_extras={"best_alpha": self.lda_model.alpha}, path=path)
        if return_best_model:
            #returns number of topics and the lda_model
            return self.coherence_dict["best_topic_no"], self.coherence_dict["best_model"]

    def create_document_topic_df(self, model=None, no_topics=10):
        """
        Creates a dataframe containing the result of the LDA model for each document. Will set the topic with the
        highest share within the document as the dominant topic.
        :param model: LDA model to use for the calculation of the topic distribution of each document.
        :param no_topics: Currently unused; the number of topics is read from the model.
        :raises Warning: If document ids and language detection are used together.
        """
        if model is None:
            model = self.lda_model
        if isinstance(model, LdaMallet):
            model = malletmodel2ldamodel(model)
        # Each document yields (topic, probability) pairs; topics missing for a document become 0 below.
        topic_result_list = [dict(doc) for doc in model.get_document_topics(bow=self.bag_of_words)]
        self.result_df = pd.DataFrame(data=topic_result_list, columns=range(model.num_topics))
        self.result_df = self.result_df.fillna(0)
        if self.document_ids is not None and not self.language_detection:
            self.result_df.index = self.document_ids
        elif self.document_ids is not None and self.language_detection:
            raise Warning("Using document ids and language detection together is not implemented (yet).")
        dominant_topic = np.argmax(self.result_df.values, axis=1)
        self.result_df['dominant_topic'] = dominant_topic

    def plot_document_topic_distribution(self):
        """Plot how many documents have each topic as dominant topic (horizontal bar plot)."""
        #todo: log normalize
        if self.result_df is None:
            raise Exception("Please create the topic distribution dataframe using the 'create_document_topic_df' "
                            "method")
        counter = Counter(self.result_df.dominant_topic)
        topic_dict = OrderedDict(sorted(counter.items(), key=lambda x: x[1], reverse=True))
        plt.figure(figsize=(10, 6))
        g = sns.barplot(x=list(topic_dict.values()), y=list(topic_dict.keys()), order=list(topic_dict.keys()),
                        orient='h')
        g.set_ylabel("topic number")
        g.set_xlabel("count")
        plt.show()

    def evaluate_model(self, no_words=30):
        """
        Store the top words of every topic of ``self.lda_model`` in ``self.word_topic_df``.
        :param no_words: Number of top words per topic; rows are padded with None if a topic has fewer words.
        :raises Exception: If no model has been created yet.
        """
        if self.lda_model is None:
            raise Exception("Please create a LDA model for evaluation before running this method.")
        topic_keywords = []
        for topic_id in range(self.lda_model.num_topics):
            words = [word for word, _ in self.lda_model.show_topic(topic_id, topn=no_words)]
            words.extend([None] * (no_words - len(words)))
            topic_keywords.append(words)
        self.word_topic_df = pd.DataFrame(topic_keywords, columns=[f"word_{x}" for x in range(no_words)])

    def evaluate_pyldavis(self, model=None, use_jupyter=None):
        """
        Method for a visual evaluation of the LDA topic model using pyldavis.
        :param model: LDA model that is to be evaluated. If 'None', it will use the last model that has been saved
        within the class.
        :param use_jupyter: set how the pyldavis panel is displayed. If default (None), it will try to find out if
        run from jupyter and set the method accordingly
        :raises Exception: If no model is given and none has been created yet.
        """
        if model is None:
            if self.lda_model is None:
                raise Exception("Please create a LDA model for evaluation before running this method.")
            model = self.lda_model
        if isinstance(model, LdaMallet):
            model = malletmodel2ldamodel(model)
        panel = pyLDAvis.gensim.prepare(model, self.bag_of_words, self.id2word)
        if use_jupyter is None:
            # Heuristic: the '_' environment variable names the launching executable.
            use_jupyter = os.environ.get('_', '').split("/")[-1] == "jupyter-notebook"
        if use_jupyter:
            pyLDAvis.enable_notebook()
            pyLDAvis.display(panel)
        else:
            pyLDAvis.show(panel)

    def print_bow(self, doc_positions):
        """Print the (word, frequency) pairs of the documents selected by the boolean selector doc_positions."""
        print([[(self.id2word[token_id], freq) for token_id, freq in doc]
               for doc in compress(self.bag_of_words, doc_positions)])

    def save_model(self, path):
        """Save the current model to *path*."""
        self.lda_model.save(path)

    def load_model(self, path):
        """Load an LdaMulticore model from *path* into ``self.lda_model``."""
        self.lda_model = LdaMulticore.load(path)

    def save_dict(self, path):
        """Save the gensim dictionary to *path*."""
        self.id2word.save(path)
        print("dict saved")

    def load_dict(self, path):
        """Load a gensim dictionary from *path* into ``self.id2word``."""
        self.id2word = corpora.Dictionary.load(path)
def main():
    """Train a MALLET LDA model on a CSV corpus and save a pyLDAvis visualisation.

    Expected command line::

        <script> <num_topics> <num_iterations> <visualization_file_path> <corpus_csv_file>

    ``sys.argv[0]`` is the script path, so the four user arguments are read
    from ``sys.argv[1:5]``. On any invalid argument the help message and the
    individual problems are printed and the process exits with status 1.

    :return: 0 on success.
    """
    print("\n-----LDA CONCEPT DETECTION-----")

    # check command line
    if len(sys.argv) != 5:
        print(HELP_MESSAGE)
        sys.exit(1)
    num_topics, num_iter, vis_file_path, corpus_csv_file = sys.argv[1:5]

    problems = []
    if not num_topics.isdigit():
        problems.append("<num_topics> must be numeric")
    if not num_iter.isdigit():
        problems.append("<num_iterations> must be numeric")
    if not vis_file_path.endswith(".html"):
        problems.append("<visualization_file_path> must end with '.html'")
    if not corpus_csv_file.endswith(".csv"):
        problems.append("<corpus_csv_file> must end with '.csv'")
    if problems:
        print(HELP_MESSAGE)
        for problem in problems:
            print(problem)
        sys.exit(1)

    num_topics = int(num_topics)
    num_iter = int(num_iter)

    # load corpus
    corpus = load_from_csv(corpus_csv_file)

    # create CountVectorizer to help remove short segments
    stop_words = load_stop_words("../../data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True, max_df=MAX_DF, min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")

    # remove short segments from the corpus
    _, proc_corpus_text_only = remove_short_segs(corpus, vectorizer)
    tokenized_segments = [seg.split() for seg in proc_corpus_text_only]

    # remove stop words and tokens shorter than three characters
    proc_stop_words = [
        [token for token in segment
         if token not in stop_words and len(token) >= 3]
        for segment in tokenized_segments
    ]

    # vectorize text with gensim's Dictionary
    id2word = Dictionary(proc_stop_words)
    corp = [id2word.doc2bow(text) for text in proc_stop_words]

    # run mallet lda model with the topic and iteration counts from the command line
    path_to_mallet_binary = "Mallet/bin/mallet"
    mallet_model = LdaMallet(path_to_mallet_binary, corpus=corp, num_topics=num_topics,
                             id2word=id2word, optimize_interval=20, random_seed=4,
                             iterations=num_iter)

    # convert to gensim model to build visualization
    gensim_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_model)
    vis = pyLDAvis.gensim.prepare(gensim_model, corp, id2word)

    # save visualization to the requested file
    pyLDAvis.save_html(vis, vis_file_path)
    return 0
# Read the serialized list of speeches; the context manager closes the file even if read() fails.
with open("discursos_all.txt", "r") as f:
    discursos_file = f.read()
# NOTE(security): eval() executes whatever the file contains. This is only acceptable for a trusted,
# locally generated file; for plain literals ast.literal_eval would be the safe replacement.
res = eval(discursos_file)

# NOTE(review): "%m" is the month directive, so the last field of the timings below is not a
# sub-second value. Left unchanged to keep the output format.
elapsed_time = time.time() - start_time
print(time.strftime("Discursos importados, demorou %H:%M:%S:%m", time.gmtime(elapsed_time)))

start_time = time.time()

# Whitespace-tokenize every speech and build the bag-of-words corpus.
data = [a.split() for a in res]
dictionary = Dictionary(data)
corpus = [dictionary.doc2bow(t) for t in data]

mallet_path = 'X:\\Programs\\mallet\\mallet-2.0.8\\bin\\mallet.bat'
lda = LdaMallet(mallet_path, corpus=corpus, id2word=dictionary, num_topics=500)

elapsed_time = time.time() - start_time
print(time.strftime("Lda model criado, demorou %H:%M:%S:%m", time.gmtime(elapsed_time)))

# Write one line per topic returned by show_topics: "[index] - word1, word2, ..." (15 words each).
with open("topics_500_latest.txt", 'w+') as f:
    for index, topic in lda.show_topics(formatted=False, num_words=15):
        f.write('[{}] - '.format(index))
        f.write(', '.join(str(line[0]) for line in topic))
        f.write('\n')
from HyperDoc2Vec import * snowball = SnowballStemmer(language='english') nlp = spacy.load('en', disable=['parser', 'ner']) nlp.Defaults.stop_words |= {'table', 'ref', 'formula', 'citation', 'cit', 'references' 'fig', 'figure', 'abstract', 'introduction', 'description','conclusion','results','discussion'} mallet_path = '/home/ashwath/mallet-2.0.8/bin/mallet' # LOAD MODELS loadmodstart = time() id2word_dictionary = corpora.Dictionary.load('/home/ashwath/Programs/ArxivCS/LDA/arxivmag.dict') corpus = corpora.MmCorpus('/home/ashwath/Programs/ArxivCS/LDA/arxivmag_bow_corpus.mm') try: ldamallet = LdaMallet.load('/home/ashwath/Programs/ArxivCS/LDA/ldamallet_arxiv.model') vec_bow_test = id2word_dictionary.doc2bow(['test']) vec_ldamallet = ldamallet[vec_bow_test] except subprocess.CalledProcessError: print("LDA MALLET COULDN'T READ INSTANCE FILE. USING NORMAL LDA INSTEAD") ldamallet = LdaModel.load('/home/ashwath/Programs/ArxivCS/LDA/lda_arxiv.model') malletindex = similarities.MatrixSimilarity.load('/home/ashwath/Programs/ArxivCS/LDA/simIndexArxiv.index') with open('/home/ashwath/Programs/ArxivCS/LDA/docid_to_magid_training_arxiv.pickle', 'rb') as pick: docid_to_magid = pickle.load(pick) hd2vmodel = HyperDoc2Vec.load('/home/ashwath/Programs/ArxivCS/hyperdoc2vec_arxivmag/models/hd2v_arxivmag.model') print("MODELS took {} seconds to load".format(time()-loadmodstart)) def remove_stopwords(context): #print("Removing stop words.")