def generate_document_like_text(prompt_text, doc_id, lda_config, generation_config):
    lda_model = LDAModel(lda_config)
    theta = lda_model.get_theta_matrix()
    psi = lda_model.get_psi_matrix()

    # get the original doc
    docs = lda_model.get_docs()
    doc = " ".join([t.strip('Ġ') for t in docs[doc_id]])
    # generation_config.max_length = len(doc.split())  # set the max_length to selected doc length

    generation_config.device = torch.device(
        "cuda" if torch.cuda.is_available() and not generation_config.no_cuda else "cpu")
    generation_config.n_gpu = torch.cuda.device_count()

    set_seed(generation_config)

    # Initialize the model and tokenizer
    try:
        generation_config.model_type = generation_config.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[generation_config.model_type]
    except KeyError:
        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)".format(
            generation_config.model_type))

    tokenizer = tokenizer_class.from_pretrained(generation_config.model_name_or_path)
    model = model_class.from_pretrained(generation_config.model_name_or_path)
    model.to(generation_config.device)

    generation_config.max_length = adjust_length_to_model(
        generation_config.max_length,
        max_sequence_length=model.config.max_position_embeddings)
    logger.info(generation_config)

    # Different models need different input formatting and/or extra arguments
    requires_preprocessing = generation_config.model_type in PREPROCESSING_FUNCTIONS.keys()
    if requires_preprocessing:
        prepare_input = PREPROCESSING_FUNCTIONS.get(generation_config.model_type)
        prompt_text = prepare_input(generation_config, model, tokenizer, prompt_text)

    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(generation_config.device)

    output_sequences = model.generate(
        input_ids=encoded_prompt,
        generation_config=generation_config,
        psi=psi,
        theta=theta,
        doc_id=doc_id,
        tokenizer=None,  # lda_model.tokenizer,
    )

    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
    text = text[: text.find(generation_config.stop_token) if generation_config.stop_token else None]

    return text, doc
def evaluate_model(data: pd.DataFrame, num_topics: int, text_data: list,
                   dictionary: corpora.Dictionary, corpus: list):
    lda_model = LDAModel(corpus=corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         passes=15,
                         start_date=data.published.min(),
                         end_date=data.published.max())
    perplexity = lda_model.log_perplexity(corpus)
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=text_data,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence = coherence_model_lda.get_coherence()
    return perplexity, coherence
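# Hedged usage sketch (illustrative, not from the original source): sweeping num_topics
# with evaluate_model on a toy DataFrame. Assumes the DataFrame carries `lemmas`
# (token lists) and `published` (dates) columns, matching the fields used above, and
# that LDAModel accepts gensim-style keyword arguments as in the function body.
import pandas as pd
from gensim import corpora

toy = pd.DataFrame({
    "lemmas": [["topic", "model", "text"], ["graph", "topic", "word"]],
    "published": pd.to_datetime(["2020-01-01", "2020-02-01"]),
})
toy_dictionary = corpora.Dictionary(toy.lemmas.to_list())
toy_corpus = [toy_dictionary.doc2bow(doc) for doc in toy.lemmas]
for k in (2, 3):
    perplexity, coherence = evaluate_model(toy, k, toy.lemmas.to_list(), toy_dictionary, toy_corpus)
    print(f"num_topics={k}: log-perplexity={perplexity:.3f}, c_v coherence={coherence:.3f}")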
def train_model(data: pd.DataFrame, num_topics: int):
    text_data = data.lemmas.to_list()
    dictionary = Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    lda_model = LDAModel(corpus=corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         passes=15,
                         start_date=data.published.min(),
                         end_date=data.published.max())
    return lda_model, corpus
def main(texts: list, num_topics=10):
    topics = list()
    lemmatized = [
        preprocess_text(raw_text=text, min_word_len=4, exclude_hashtags=False, lemmatize=True)
        for text in texts
    ]
    dictionary = Dictionary(lemmatized)
    corpus = [dictionary.doc2bow(text) for text in lemmatized]
    lda_model = LDAModel(corpus=corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         passes=15,
                         start_date=None,
                         end_date=None)
    for i in range(num_topics):
        words, weights = list(), list()
        terms = lda_model.get_topic_terms(i, topn=10)
        for word_id, weight in terms:
            words.append(lda_model.id2word[word_id])
            weights.append(weight)
        topics.append({'id': i, 'key_words': words, 'weights': weights})
    return topics
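# Hedged usage sketch (illustrative, not from the original source): calling main() on a
# couple of raw strings and printing the keywords of each discovered topic. The sample
# sentences are made up for demonstration only.
if __name__ == "__main__":
    sample_texts = [
        "The senate debated the new climate bill for several hours today.",
        "The team released a language model that generates topic-conditioned text.",
    ]
    for topic in main(sample_texts, num_topics=2):
        print(topic["id"], list(zip(topic["key_words"], topic["weights"])))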
def __init__(self, config):
    if type(config) == LDAConfig:
        self.model = LDAModel(config)
    elif type(config) == LSIConfig:
        self.model = LSIModel(config)
        # config_file = "../configs/alexa_lsi_config.json"
        # config = LSIConfig.from_json_file(config_file)
        # model = LSIModel(config=config, build=False)

    self.dictionary = self.model.get_dictionary()
    temp = self.dictionary[0]  # This is only to "load" the dictionary.
    self.cm = CoherenceModel(model=self.model.get_model(),
                             texts=self.model.get_docs(),
                             dictionary=self.dictionary,
                             coherence="c_w2v")
texts = []
for i in range(100):
    generation_config.seed = i
    text, doc = generate_document_like_text(prompt_text="This is a",
                                            doc_id=doc_id,  # 173,
                                            lda_config=config,
                                            generation_config=generation_config)
    texts.append(text)
    print("original: ", doc)
    print("generated: ", text)

all_text = " ".join(texts)

lda = LDAModel(config)
num_topics = sum(lda.get_theta_matrix()[doc_id, :] != 0)

# visualize
lda_config_file = "/home/rohola/codes/topical_language_generation/configs/generated_fake_alexa_lda_config.json"
config = LDAConfig.from_json_file(lda_config_file)
config.num_topics = num_topics

# save the generated text to disk
if not os.path.isdir(config.dataset_dir):
    os.mkdir(config.dataset_dir)

with open(os.path.join(config.dataset_dir, "generated_text.txt"), 'w') as file_writer:
    file_writer.write(all_text)
import os
import sys
import shutil

from corpus_processor import get_lda_input_from_corpus_folder
from lda_model import LDAModel


def setup_temp_corpus_folder(filename):
    os.system('rm -rf ' + LDAModel.TEMP_CORPUS_FOLDER + '/*')
    shutil.copy(filename, LDAModel.TEMP_CORPUS_FOLDER)


if __name__ == '__main__':
    if 3 != len(sys.argv):
        print("\nUsage: %s category_name\n" % sys.argv[0])
        sys.exit(1)

    # ========================================
    obj = LDAModel()
    lda_input = get_lda_input_from_corpus_folder(LDAModel.TEMP_CORPUS_FOLDER)
    dic, corp, mod = obj.lda_test(lda_input)
    topics = mod.print_topics(num_words=7)
    for topic in topics:
        print(topic)
tt_uv = [(u, v)
         for u, v, e in zip(tt.tocoo().row, tt.tocoo().col, tt.tocoo().data)
         for _ in range(e)]
print("done in %0.3fs." % (time() - t0))
print()

print("Preparing dgl graphs...")
t0 = time()
G = dgl.heterograph({('doc', 'topic', 'word'): tf_uv}, device=device)
Gt = dgl.heterograph({('doc', 'topic', 'word'): tt_uv}, device=device)
print("done in %0.3fs." % (time() - t0))
print()

print("Training dgl-lda model...")
t0 = time()
model = LDAModel(G, n_components)
model.fit(G)
print("done in %0.3fs." % (time() - t0))
print()

print(f"dgl-lda training perplexity {model.perplexity(G):.3f}")
print(f"dgl-lda testing perplexity {model.perplexity(Gt):.3f}")

plot_top_words(
    type('dummy', (object, ), {'components_': G.ndata['z']['word'].cpu().numpy().T}),
    tf_feature_names, n_top_words, 'Topics in LDA model')

print("Training scikit-learn model...")
print(
from collections import namedtuple
from dataclasses import dataclass

from visualization.topic_modeling_semantic_network import visualize_semantic_netwrok
from lda_model import LDAModel

# config_file = "configs/alexa_lda_config.json"
# config_file = "configs/nytimes_lda_config.json"
# config_file = "configs/anes_lda_config.json"
config_file = "configs/congress_lda_config.json"

lda = LDAModel(config_file=config_file)
lda._start()

all_topic_tokens = lda.get_all_topic_tokens(num_words=15)

# clean up words
topic_words = [[(t[0].strip('Ġ'), t[1]) for t in tw] for tw in all_topic_tokens]

for topic in topic_words:
    print(topic)

# todo: remove dataclass and replace it with VisualizationConfig class
@dataclass
class config:
    dimension: int = 2
    threshold: float = 0.00001
    node_size: float = 30
    color_scale: str = "Viridis"
    title: str = "LDA"
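# Hedged continuation (an assumption, not from the original source): presumably the
# dataclass above stands in for the visualization config and is passed to
# visualize_semantic_netwrok together with the cleaned topic words, mirroring the call
# pattern used in the Streamlit snippet further down.
fig = visualize_semantic_netwrok(config, topic_words, auto_open=True)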
from corpus_processor import get_lda_input_from_corpus_folder
from lda_model import LDAModel
from pprint import pprint
from collections import Counter
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel

if __name__ == '__main__':
    # limit = 40
    # start = 2
    # step = 2
    obj = LDAModel()
    lda_input = get_lda_input_from_corpus_folder('./dataset/TRAIN')
    output = obj.extract_words(lda_input)
    lm, top_topics = obj.lda_train(lda_input)
    print(top_topics[:5])
    # print("show topics", lm.show_topics(formatted=False))
    pprint([lm.show_topic(topicid, topn=12) for topicid, c_v in top_topics[:8]])

    # lda_lsi_topics = [[word for word, prob in lm.show_topic(topicid)] for topicid, c_v in top_topics]
    # print("topic of lda_lsi", lda_lsi_topics)
    # model_list, coherence_values = obj.compute_coherence_values(dictionary=dic, corpus=corpus, texts=lda_input, start=2, limit=40, step=2)
    # x = range(start, limit, step)
    # for m, cv in zip(x, coherence_values):
    #     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
    # words = obj.extract_words(lda_input)
    # print(lda_input)
    # dic, corp, mod = obj.lda_train(lda_input)
    # print(top_topics[:5])
tt_uv = [(u, v)
         for u, v, e in zip(tt.tocoo().row, tt.tocoo().col, tt.tocoo().data)
         for _ in range(e)]
print("done in %0.3fs." % (time() - t0))
print()

print("Preparing dgl graphs...")
t0 = time()
G = dgl.heterograph({('doc', 'topic', 'word'): tf_uv}, device=device)
Gt = dgl.heterograph({('doc', 'topic', 'word'): tt_uv}, device=device)
print("done in %0.3fs." % (time() - t0))
print()

print("Training dgl-lda model...")
t0 = time()
model = LDAModel(G.num_nodes('word'), n_components)
model.fit(G)
print("done in %0.3fs." % (time() - t0))
print()

print(f"dgl-lda training perplexity {model.perplexity(G):.3f}")
print(f"dgl-lda testing perplexity {model.perplexity(Gt):.3f}")

word_nphi = np.vstack([nphi.tolist() for nphi in model.word_data.nphi])
plot_top_words(type('dummy', (object, ), {'components_': word_nphi}),
               tf_feature_names, n_top_words, 'Topics in LDA model')

print("Training scikit-learn model...")
print(
    '\n' * 2, "Fitting LDA models with tf features, "
def similarity_measure(config, generation_config, num_docs):
    text_similarity = TextSimilarity()
    nnlm_tlg_similarities = []
    nnlm_gpt_similarities = []

    # gpt_text = generate_unconditional_text(prompt_text="This is a",
    #                                        generation_config=generation_config)

    lda_model = LDAModel(config)
    docs = lda_model.get_docs()

    ###############################
    generation_config.device = torch.device(
        "cuda" if torch.cuda.is_available() and not generation_config.no_cuda else "cpu")
    generation_config.n_gpu = torch.cuda.device_count()

    set_seed(generation_config)

    # Initialize the model and tokenizer
    try:
        generation_config.model_type = generation_config.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[generation_config.model_type]
    except KeyError:
        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)".format(
            generation_config.model_type))

    tokenizer = tokenizer_class.from_pretrained(generation_config.model_name_or_path)
    model = model_class.from_pretrained(generation_config.model_name_or_path)
    model.to(generation_config.device)
    ###############################

    for doc_id in range(num_docs):
        if doc_id > 100:
            break
        doc = " ".join([t.strip('Ġ') for t in docs[doc_id]])
        generation_config.max_length = 2 * len(doc.split())
        prompt_text = doc

        #########################################
        generation_config.max_length = adjust_length_to_model(
            generation_config.max_length,
            max_sequence_length=model.config.max_position_embeddings)

        # Different models need different input formatting and/or extra arguments
        requires_preprocessing = generation_config.model_type in PREPROCESSING_FUNCTIONS.keys()
        if requires_preprocessing:
            prepare_input = PREPROCESSING_FUNCTIONS.get(generation_config.model_type)
            prompt_text = prepare_input(generation_config, model, tokenizer, prompt_text)

        encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
        encoded_prompt = encoded_prompt.to(generation_config.device)

        output_sequences = model.generate(
            input_ids=encoded_prompt,
            generation_config=generation_config,
        )

        # Batch size == 1. To add more examples please use num_return_sequences > 1
        generated_sequence = output_sequences[0].tolist()
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
        text = text[:text.find(generation_config.stop_token) if generation_config.stop_token else None]
        ##########################################

        gpt_text = text
        # gpt_text = generate_unconditional_text(prompt_text=doc,
        #                                        generation_config=generation_config)
        gpt_text = gpt_text.split()[len(doc.split()):]  # remove the prompt
        gpt_text = " ".join(gpt_text)

        # tlg_text, doc = generate_document_like_text(prompt_text="This is a",  # we also can change this
        #                                             doc_id=doc_id,
        #                                             lda_config=config,
        #                                             generation_config=generation_config)
        #
        # nnlm_tlg_similarities.append(text_similarity.nnlm_sentence_similarity(tlg_text, doc))

        nnlm_gpt_similarities.append(text_similarity.nnlm_sentence_similarity(gpt_text, doc))

    # print("nnlm_tlg_similarities", np.mean(nnlm_tlg_similarities), np.std(nnlm_tlg_similarities))
    print("nnlm_gpt_similarities", np.mean(nnlm_gpt_similarities), np.std(nnlm_gpt_similarities))
if __name__ == "__main__":
    lda_config_file = "/home/rohola/codes/topical_language_generation/configs/alexa_lda_config.json"
    generation_config_file = "/home/rohola/codes/topical_language_generation/configs/generation_config.json"

    config = LDAConfig.from_json_file(lda_config_file)
    generation_config = GenerationConfig.from_json_file(generation_config_file)

    lda_model = LDAModel(config, False)
    theta = lda_model.get_theta_matrix()
    num_docs = theta.shape[0]
    print(num_docs)

    similarity_measure(config, generation_config, num_docs)
def generate_lda_text(prompt_text, selected_topic_index, lda_config, generation_config, plot=False):
    generation_config.device = torch.device(
        "cuda" if torch.cuda.is_available() and not generation_config.no_cuda else "cpu")
    generation_config.n_gpu = torch.cuda.device_count()

    set_seed(generation_config)

    # Initialize the model and tokenizer
    try:
        generation_config.model_type = generation_config.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[generation_config.model_type]
    except KeyError:
        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)".format(
            generation_config.model_type))

    tokenizer = tokenizer_class.from_pretrained(generation_config.model_name_or_path)
    model = model_class.from_pretrained(generation_config.model_name_or_path)
    model.to(generation_config.device)

    # generation_config.max_length = adjust_length_to_model(generation_config.max_length,
    #                                                       max_sequence_length=model.config.max_position_embeddings)
    logger.info(generation_config)

    # Different models need different input formatting and/or extra arguments
    requires_preprocessing = generation_config.model_type in PREPROCESSING_FUNCTIONS.keys()
    if requires_preprocessing:
        prepare_input = PREPROCESSING_FUNCTIONS.get(generation_config.model_type)
        prompt_text = prepare_input(generation_config, model, tokenizer, prompt_text)

    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(generation_config.device)

    lda_model = LDAModel(lda_config)
    # theta = lda_model.get_theta_matrix()
    psi = lda_model.get_psi_matrix()
    theta = None

    output_sequences, total_entropies, token_entropies, kl_divergences, token_weights, all_top_words = model.generate(
        input_ids=encoded_prompt,
        generation_config=generation_config,
        selected_topic_index=selected_topic_index,
        psi=psi,
        theta=theta,
        tokenizer=None,  # lda_model.tokenizer,
    )

    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
    text = text[: text.find(generation_config.stop_token) if generation_config.stop_token else None]

    tokens = [lda_model.tokenizer.tokenizer.convert_ids_to_tokens(i).strip('Ġ') for i in generated_sequence]

    if plot:
        show_prompt = False
        if show_prompt:
            prompt_padding = [0] * len(encoded_prompt[0])
            total_entropies = prompt_padding + total_entropies
            token_entropies = prompt_padding + token_entropies
            kl_divergences = prompt_padding + kl_divergences
        else:
            tokens = tokens[len(encoded_prompt[0]):]

        # barchart(tokens, total_entropies)
        multi_barchart(tokens, total_entropies, token_entropies, names=["Total Entropy", "Token Entropy"])
        barchart(tokens, kl_divergences)
        top_words_prob_plot(all_top_words)

    return text, tokens, token_weights
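# Hedged usage sketch (not from the original source): driving generate_lda_text with
# config files that appear elsewhere in this repo; the prompt text and topic index are
# illustrative assumptions.
if __name__ == "__main__":
    lda_config = LDAConfig.from_json_file("configs/alexa_lda_config.json")
    generation_config = GenerationConfig.from_json_file("configs/generation_config.json")
    generated_text, generated_tokens, token_weights = generate_lda_text(
        prompt_text="This is a",
        selected_topic_index=0,
        lda_config=lda_config,
        generation_config=generation_config,
        plot=False)
    print(generated_text)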
def main():
    config_file = "configs/generation_topical_config.json"
    config = GenerationConfig.from_json_file(config_file)

    config.device = torch.device("cuda" if torch.cuda.is_available() and not config.no_cuda else "cpu")
    config.n_gpu = torch.cuda.device_count()

    set_seed(config)

    # Initialize the model and tokenizer
    try:
        config.model_type = config.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[config.model_type]
    except KeyError:
        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)".format(
            config.model_type))

    tokenizer = tokenizer_class.from_pretrained(config.model_name_or_path)
    model = model_class.from_pretrained(config.model_name_or_path)
    model.to(config.device)

    config.length = adjust_length_to_model(config.length, max_sequence_length=model.config.max_position_embeddings)
    logger.info(config)

    prompt_text = input("Model prompt >>> ")

    # Different models need different input formatting and/or extra arguments
    requires_preprocessing = config.model_type in PREPROCESSING_FUNCTIONS.keys()
    if requires_preprocessing:
        prepare_input = PREPROCESSING_FUNCTIONS.get(config.model_type)
        prompt_text = prepare_input(config, model, tokenizer, prompt_text)

    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(config.device)

    topical_model = "lsi"  # "lda"

    if topical_model == "lda":
        lda_config_file = "configs/alexa_lda_config.json"
        lda_model = LDAModel(lda_config_file)
        theta = lda_model.get_theta_matrix()
        psi = lda_model.get_psi_matrix()
        # theta = None

        output_sequences = model.generate(
            input_ids=encoded_prompt,
            psi=psi,
            theta=theta,
            tokenizer=lda_model.tokenizer,
            max_length=config.length,
            temperature=config.temperature,
            top_k=config.top_k,
            top_p=config.top_p,
            repetition_penalty=config.repetition_penalty,
        )
    elif topical_model == "lsi":
        lsi_config_file = "configs/congress_lsi_config.json"
        lsi_model = LSIModel(lsi_config_file)
        topic_word_matrix = lsi_model.get_topic_words_matrix()

        output_sequences = model.generate(
            input_ids=encoded_prompt,
            topic_word_matrix=topic_word_matrix,
            tokenizer=lsi_model.tokenizer,
            max_length=config.length,
            temperature=config.temperature,
            top_k=config.top_k,
            top_p=config.top_p,
            repetition_penalty=config.repetition_penalty,
        )

    # Batch size == 1. To add more examples please use num_return_sequences > 1
    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
    text = text[: text.find(config.stop_token) if config.stop_token else None]
    print(text)

    return text
    st.sidebar.text_input("Repetition Penalty: ", 1.2))
session_state.length = int(st.sidebar.text_input("Generated Text Length: ", 50))
session_state.seed_number = int(st.sidebar.text_input("Seed Number: ", 41))

topic_words = []
if st.button("Plot Topics"):
    with st.spinner('Please be patient ...'):
        if session_state.topic_model == "lda":
            session_state.config = get_draft_config(session_state.topic_model, session_state.dataset)
            session_state.config.num_topics = session_state.num_topics
            session_state.config.alpha = session_state.alpha

            lda = LDAModel(session_state.config, build=True)
            all_topic_tokens = lda.get_all_topic_tokens(num_words=15)

            a = lda.get_psi_matrix()
            print("first time", a.max())

            # clean up words
            session_state.topic_words = [[(t[0].strip('Ġ'), t[1]) for t in tw] for tw in all_topic_tokens]

            plot_config = PlotConfig.from_json_file("configs/lda_plot_config.json")
            fig = visualize_semantic_netwrok(plot_config, session_state.topic_words, auto_open=False)
            st.plotly_chart(fig)