def generate_document_like_text(prompt_text, doc_id, lda_config, generation_config):
    lda_model = LDAModel(lda_config)
    theta = lda_model.get_theta_matrix()
    psi = lda_model.get_psi_matrix()

    # get the original doc
    docs = lda_model.get_docs()
    doc = " ".join([t.strip('Ġ') for t in docs[doc_id]])
    # generation_config.max_length = len(doc.split()) # set the max_length to selected doc length

    generation_config.device = torch.device(
        "cuda" if torch.cuda.is_available() and not generation_config.no_cuda else "cpu")
    generation_config.n_gpu = torch.cuda.device_count()

    set_seed(generation_config)

    # Initialize the model and tokenizer
    try:
        generation_config.model_type = generation_config.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[generation_config.model_type]
    except KeyError:
        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)")

    tokenizer = tokenizer_class.from_pretrained(generation_config.model_name_or_path)
    model = model_class.from_pretrained(generation_config.model_name_or_path)
    model.to(generation_config.device)

    generation_config.max_length = adjust_length_to_model(generation_config.max_length,
                                                          max_sequence_length=model.config.max_position_embeddings)
    logger.info(generation_config)

    # Different models need different input formatting and/or extra arguments
    requires_preprocessing = generation_config.model_type in PREPROCESSING_FUNCTIONS.keys()
    if requires_preprocessing:
        prepare_input = PREPROCESSING_FUNCTIONS.get(generation_config.model_type)
        prompt_text = prepare_input(generation_config, model, tokenizer, prompt_text)
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(generation_config.device)

    output_sequences = model.generate(
        input_ids=encoded_prompt,
        generation_config=generation_config,
        psi=psi,
        theta=theta,
        doc_id=doc_id,
        tokenizer=None,  # lda_model.tokenizer,
    )

    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
    text = text[: text.find(generation_config.stop_token) if generation_config.stop_token else None]

    return text, doc
def evaluate_model(data: pd.DataFrame, num_topics: int, text_data: list,
                   dictionary: corpora.Dictionary, corpus: list):
    lda_model = LDAModel(corpus=corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         passes=15,
                         start_date=data.published.min(),
                         end_date=data.published.max())
    perplexity = lda_model.log_perplexity(corpus)
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=text_data,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence = coherence_model_lda.get_coherence()
    return perplexity, coherence
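# Usage sketch (not part of the original snippet): sweep candidate topic counts
# with evaluate_model above to pick num_topics. Assumes `data` has `lemmas` and
# `published` columns and that gensim's Dictionary is imported, as in train_model below.
def sweep_num_topics(data: pd.DataFrame, candidates=(5, 10, 20)):
    text_data = data.lemmas.to_list()
    dictionary = Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    scores = {}
    for k in candidates:
        # lower perplexity and higher c_v coherence are better
        perplexity, coherence = evaluate_model(data, k, text_data, dictionary, corpus)
        scores[k] = (perplexity, coherence)
    return scores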
def train_model(data: pd.DataFrame, num_topics: int):
    text_data = data.lemmas.to_list()
    dictionary = Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    lda_model = LDAModel(corpus=corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         passes=15,
                         start_date=data.published.min(),
                         end_date=data.published.max())
    return lda_model, corpus
def main(texts: list, num_topics=10):
    topics = list()
    lemmatized = [
        preprocess_text(raw_text=text,
                        min_word_len=4,
                        exclude_hashtags=False,
                        lemmatize=True) for text in texts
    ]
    dictionary = Dictionary(lemmatized)
    corpus = [dictionary.doc2bow(text) for text in lemmatized]
    lda_model = LDAModel(corpus=corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         passes=15,
                         start_date=None,
                         end_date=None)
    for i in range(num_topics):
        words, weights = list(), list()
        terms = lda_model.get_topic_terms(i, topn=10)
        for word_id, weight in terms:
            words.append(lda_model.id2word[word_id])
            weights.append(weight)
        topics.append({'id': i, 'key_words': words, 'weights': weights})
    return topics
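# Usage sketch (not part of the original snippet): run the pipeline above on a
# couple of raw strings and print the extracted topics. The sample strings are
# placeholders.
def print_example_topics():
    sample_texts = [
        "The committee debated the new transportation budget for several hours.",
        "The device ships with a faster processor and a much larger battery.",
    ]
    for topic in main(sample_texts, num_topics=2):
        print(topic['id'], topic['key_words'], topic['weights'])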
    def __init__(self, config):
        if isinstance(config, LDAConfig):
            self.model = LDAModel(config)
        elif isinstance(config, LSIConfig):
            self.model = LSIModel(config)
        else:
            raise ValueError("config must be an LDAConfig or LSIConfig instance")

        # config_file = "../configs/alexa_lsi_config.json"
        # config = LSIConfig.from_json_file(config_file)
        # model = LSIModel(config=config, build=False)

        self.dictionary = self.model.get_dictionary()
        temp = self.dictionary[0]  # This is only to "load" the dictionary.
        self.cm = CoherenceModel(model=self.model.get_model(),
                                 texts=self.model.get_docs(),
                                 dictionary=self.dictionary,
                                 coherence="c_w2v")
texts = []
for i in range(100):
    generation_config.seed = i
    text, doc = generate_document_like_text(
        prompt_text="This is a",
        doc_id=doc_id,  #173,
        lda_config=config,
        generation_config=generation_config)
    texts.append(text)
    print("original: ", doc)
    print("generated: ", text)

all_text = " ".join(texts)

lda = LDAModel(config)
num_topics = sum(lda.get_theta_matrix()[doc_id, :] != 0)
# visualize

lda_config_file = "/home/rohola/codes/topical_language_generation/configs/generated_fake_alexa_lda_config.json"

config = LDAConfig.from_json_file(lda_config_file)
config.num_topics = num_topics

## save the generated text to disk
if not os.path.isdir(config.dataset_dir):
    os.mkdir(config.dataset_dir)
with open(os.path.join(config.dataset_dir, "generated_text.txt"),
          'w') as file_writer:
    file_writer.write(all_text)
Example #7
import os
import sys
import shutil

from corpus_processor import get_lda_input_from_corpus_folder

from lda_model import LDAModel


def setup_temp_corpus_folder(filename):
    os.system('rm -rf ' + LDAModel.TEMP_CORPUS_FOLDER + '/*')
    shutil.copy(filename, LDAModel.TEMP_CORPUS_FOLDER)


if __name__ == '__main__':

    if len(sys.argv) != 2:
        print("\nUsage: %s category_name\n" % sys.argv[0])
        sys.exit(1)

    # ========================================
    obj = LDAModel()

    lda_input = get_lda_input_from_corpus_folder(LDAModel.TEMP_CORPUS_FOLDER)

    dic, corp, mod = obj.lda_test(lda_input)
    topics = mod.print_topics(num_words=7)
    for topic in topics:
        print(topic)
Example #8
tt_uv = [(u, v) for u, v, e in zip(tt.tocoo().row,
                                   tt.tocoo().col,
                                   tt.tocoo().data) for _ in range(e)]
print("done in %0.3fs." % (time() - t0))
print()

print("Preparing dgl graphs...")
t0 = time()
G = dgl.heterograph({('doc', 'topic', 'word'): tf_uv}, device=device)
Gt = dgl.heterograph({('doc', 'topic', 'word'): tt_uv}, device=device)
print("done in %0.3fs." % (time() - t0))
print()

print("Training dgl-lda model...")
t0 = time()
model = LDAModel(G, n_components)
model.fit(G)
print("done in %0.3fs." % (time() - t0))
print()

print(f"dgl-lda training perplexity {model.perplexity(G):.3f}")
print(f"dgl-lda testing perplexity {model.perplexity(Gt):.3f}")

plot_top_words(
    type('dummy', (object, ),
         {'components_': G.ndata['z']['word'].cpu().numpy().T}),
    tf_feature_names, n_top_words, 'Topics in LDA model')

print("Training scikit-learn model...")

print(
Example #9
from collections import namedtuple
from dataclasses import dataclass
from visualization.topic_modeling_semantic_network import visualize_semantic_netwrok
from lda_model import LDAModel

#config_file = "configs/alexa_lda_config.json"
#config_file = "configs/nytimes_lda_config.json"
#config_file = "configs/anes_lda_config.json"
config_file = "configs/congress_lda_config.json"

lda = LDAModel(config_file=config_file)

lda._start()

all_topic_tokens = lda.get_all_topic_tokens(num_words=15)

#clean up words
topic_words = [[(t[0].strip('Ġ'), t[1]) for t in tw]
               for tw in all_topic_tokens]
for topic in topic_words:
    print(topic)


#todo remove dataclass and replace it with VisualizationConfig class
@dataclass
class config:
    dimension: int = 2
    threshold: float = 0.00001
    node_size: float = 30
    color_scale: str = "Viridis"
    title: str = "LDA"
Example #10
from corpus_processor import get_lda_input_from_corpus_folder
from lda_model import LDAModel
from pprint import pprint
from collections import Counter
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel

if __name__ == '__main__':
    #limit=40
    #start=2
    #step=2
    obj = LDAModel()
    lda_input = get_lda_input_from_corpus_folder('./dataset/TRAIN')
    output = obj.extract_words(lda_input)
    lm, top_topics = obj.lda_train(lda_input)
    print(top_topics[:5])
    #print("show topics",lm.show_topics(formatted=False))
    pprint(
        [lm.show_topic(topicid, topn=12) for topicid, c_v in top_topics[:8]])

    #lda_lsi_topics = [[word for word, prob in lm.show_topic(topicid)] for topicid, c_v in top_topics]
# print ("topic of lda_lsi", lda_lsi_topics)

# model_list, coherence_values = obj.compute_coherence_values(dictionary=dic,corpus=corpus, texts=lda_input,  start=2, limit=40, step=2)
# x = range(start, limit, step)
#for m, cv in zip(x, coherence_values):
#print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
#words = obj.extract_words(lda_input)
#print(lda_input)

#dic, corp, mod = obj.lda_train(lda_input)
#print(top_topics[:5])
Example #11
tt_uv = [(u, v) for u, v, e in zip(tt.tocoo().row,
                                   tt.tocoo().col,
                                   tt.tocoo().data) for _ in range(e)]
print("done in %0.3fs." % (time() - t0))
print()

print("Preparing dgl graphs...")
t0 = time()
G = dgl.heterograph({('doc', 'topic', 'word'): tf_uv}, device=device)
Gt = dgl.heterograph({('doc', 'topic', 'word'): tt_uv}, device=device)
print("done in %0.3fs." % (time() - t0))
print()

print("Training dgl-lda model...")
t0 = time()
model = LDAModel(G.num_nodes('word'), n_components)
model.fit(G)
print("done in %0.3fs." % (time() - t0))
print()

print(f"dgl-lda training perplexity {model.perplexity(G):.3f}")
print(f"dgl-lda testing perplexity {model.perplexity(Gt):.3f}")

word_nphi = np.vstack([nphi.tolist() for nphi in model.word_data.nphi])
plot_top_words(type('dummy', (object, ), {'components_': word_nphi}),
               tf_feature_names, n_top_words, 'Topics in LDA model')

print("Training scikit-learn model...")

print(
    '\n' * 2, "Fitting LDA models with tf features, "
Example #12
def similarity_measure(config, generation_config, num_docs):
    text_similarity = TextSimilarity()

    nnlm_tlg_similarities = []
    nnlm_gpt_similarities = []

    # gpt_text = generate_unconditional_text(prompt_text="This is a",
    #                                        generation_config=generation_config)

    lda_model = LDAModel(config)
    docs = lda_model.get_docs()

    ###############################
    generation_config.device = torch.device(
        "cuda" if torch.cuda.is_available() and not generation_config.no_cuda else "cpu")
    generation_config.n_gpu = torch.cuda.device_count()

    set_seed(generation_config)

    # Initialize the model and tokenizer
    try:
        generation_config.model_type = generation_config.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[
            generation_config.model_type]
    except KeyError:
        raise KeyError(
            "the model {} you specified is not supported. You are welcome to add it and open a PR :)"
            .format(generation_config.model_type))

    tokenizer = tokenizer_class.from_pretrained(
        generation_config.model_name_or_path)
    model = model_class.from_pretrained(generation_config.model_name_or_path)
    model.to(generation_config.device)

    ###############################

    for doc_id in range(num_docs):
        if doc_id > 100:
            break

        doc = " ".join([t.strip('Ġ') for t in docs[doc_id]])

        generation_config.max_length = 2 * len(doc.split())

        prompt_text = doc
        #########################################
        generation_config.max_length = adjust_length_to_model(
            generation_config.max_length,
            max_sequence_length=model.config.max_position_embeddings)
        # Different models need different input formatting and/or extra arguments
        requires_preprocessing = generation_config.model_type in PREPROCESSING_FUNCTIONS
        if requires_preprocessing:
            prepare_input = PREPROCESSING_FUNCTIONS.get(
                generation_config.model_type)
            prompt_text = prepare_input(generation_config, model, tokenizer,
                                        prompt_text)
        encoded_prompt = tokenizer.encode(prompt_text,
                                          add_special_tokens=False,
                                          return_tensors="pt")
        encoded_prompt = encoded_prompt.to(generation_config.device)

        output_sequences = model.generate(
            input_ids=encoded_prompt,
            generation_config=generation_config,
        )

        # Batch size == 1. To generate more sequences, set num_return_sequences > 1.
        generated_sequence = output_sequences[0].tolist()
        text = tokenizer.decode(generated_sequence,
                                clean_up_tokenization_spaces=True)
        text = text[:text.find(generation_config.stop_token
                               ) if generation_config.stop_token else None]

        ##########################################

        gpt_text = text

        # gpt_text = generate_unconditional_text(prompt_text=doc,
        #                                        generation_config=generation_config)

        gpt_text = gpt_text.split()[len(doc.split()):]  # remove the prompt
        gpt_text = " ".join(gpt_text)

        # tlg_text, doc = generate_document_like_text(prompt_text="This is a", # we also can change this
        #                                             doc_id=doc_id,
        #                                             lda_config=config,
        #                                             generation_config=generation_config)
        #
        # nnlm_tlg_similarities.append(text_similarity.nnlm_sentence_similarity(tlg_text, doc))
        nnlm_gpt_similarities.append(
            text_similarity.nnlm_sentence_similarity(gpt_text, doc))

    #print("nnlm_tlg_similarities", np.mean(nnlm_tlg_similarities), np.std(nnlm_tlg_similarities))
    print("nnlm_gpt_similarities", np.mean(nnlm_gpt_similarities),
          np.std(nnlm_gpt_similarities))
Example #13
        gpt_text = gpt_text.split()[len(doc.split()):]  # remove the prompt
        gpt_text = " ".join(gpt_text)

        # tlg_text, doc = generate_document_like_text(prompt_text="This is a", # we also can change this
        #                                             doc_id=doc_id,
        #                                             lda_config=config,
        #                                             generation_config=generation_config)
        #
        # nnlm_tlg_similarities.append(text_similarity.nnlm_sentence_similarity(tlg_text, doc))
        nnlm_gpt_similarities.append(
            text_similarity.nnlm_sentence_similarity(gpt_text, doc))

    #print("nnlm_tlg_similarities", np.mean(nnlm_tlg_similarities), np.std(nnlm_tlg_similarities))
    print("nnlm_gpt_similarities", np.mean(nnlm_gpt_similarities),
          np.std(nnlm_gpt_similarities))


if __name__ == "__main__":
    lda_config_file = "/home/rohola/codes/topical_language_generation/configs/alexa_lda_config.json"
    generation_config_file = "/home/rohola/codes/topical_language_generation/configs/generation_config.json"

    config = LDAConfig.from_json_file(lda_config_file)
    generation_config = GenerationConfig.from_json_file(generation_config_file)

    lda_model = LDAModel(config, False)
    theta = lda_model.get_theta_matrix()
    num_docs = theta.shape[0]
    print(num_docs)
    similarity_measure(config, generation_config, num_docs)
def generate_lda_text(prompt_text, selected_topic_index, lda_config, generation_config, plot=False):
    generation_config.device = torch.device(
        "cuda" if torch.cuda.is_available() and not generation_config.no_cuda else "cpu")
    generation_config.n_gpu = torch.cuda.device_count()

    set_seed(generation_config)

    # Initialize the model and tokenizer
    try:
        generation_config.model_type = generation_config.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[generation_config.model_type]
    except KeyError:
        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)")

    tokenizer = tokenizer_class.from_pretrained(generation_config.model_name_or_path)
    model = model_class.from_pretrained(generation_config.model_name_or_path)
    model.to(generation_config.device)

    # generation_config.max_length = adjust_length_to_model(generation_config.max_length,
    #                                                       max_sequence_length=model.config.max_position_embeddings)
    logger.info(generation_config)

    # Different models need different input formatting and/or extra arguments
    requires_preprocessing = generation_config.model_type in PREPROCESSING_FUNCTIONS.keys()
    if requires_preprocessing:
        prepare_input = PREPROCESSING_FUNCTIONS.get(generation_config.model_type)
        prompt_text = prepare_input(generation_config, model, tokenizer, prompt_text)
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(generation_config.device)

    lda_model = LDAModel(lda_config)
    # theta = lda_model.get_theta_matrix()
    psi = lda_model.get_psi_matrix()
    theta = None

    output_sequences, total_entropies, token_entropies, kl_divergences, token_weights, all_top_words = model.generate(
        input_ids=encoded_prompt,
        generation_config=generation_config,
        selected_topic_index=selected_topic_index,
        psi=psi,
        theta=theta,
        tokenizer=None,  # lda_model.tokenizer
    )

    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
    text = text[: text.find(generation_config.stop_token) if generation_config.stop_token else None]

    tokens = [lda_model.tokenizer.tokenizer.convert_ids_to_tokens(i).strip('Ġ') for i in generated_sequence]
    if plot:
        show_prompt = False
        if show_prompt:
            prompt_padding = [0] * len(encoded_prompt[0])
            total_entropies = prompt_padding + total_entropies
            token_entropies = prompt_padding + token_entropies
            kl_divergences = prompt_padding + kl_divergences
        else:
            tokens = tokens[len(encoded_prompt[0]):]


        # barchart(tokens, total_entropies)
        multi_barchart(tokens, total_entropies, token_entropies, names=["Total Entropy",
                                                                        "Token Entropy"])
        barchart(tokens, kl_divergences)

        top_words_prob_plot(all_top_words)


    return text, tokens, token_weights
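# Usage sketch (not part of the original snippet): the config paths are the ones
# used elsewhere in these examples and may need to be adapted. generate_lda_text
# biases generation toward one LDA topic and returns the text with per-token weights.
def run_generate_lda_text_example():
    lda_config = LDAConfig.from_json_file("configs/alexa_lda_config.json")
    generation_config = GenerationConfig.from_json_file("configs/generation_config.json")
    text, tokens, token_weights = generate_lda_text(
        prompt_text="This is a",
        selected_topic_index=0,
        lda_config=lda_config,
        generation_config=generation_config,
        plot=False,
    )
    print(text)
    return text, tokens, token_weights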
def main():
    config_file = "configs/generation_topical_config.json"
    config = GenerationConfig.from_json_file(config_file)

    config.device = torch.device("cuda" if torch.cuda.is_available() and not config.no_cuda else "cpu")
    config.n_gpu = torch.cuda.device_count()

    set_seed(config)

    # Initialize the model and tokenizer
    try:
        config.model_type = config.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[config.model_type]
    except KeyError:
        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)")

    tokenizer = tokenizer_class.from_pretrained(config.model_name_or_path)
    model = model_class.from_pretrained(config.model_name_or_path)
    model.to(config.device)

    config.length = adjust_length_to_model(config.length, max_sequence_length=model.config.max_position_embeddings)
    logger.info(config)

    prompt_text = input("Model prompt >>> ")

    # Different models need different input formatting and/or extra arguments
    requires_preprocessing = config.model_type in PREPROCESSING_FUNCTIONS.keys()
    if requires_preprocessing:
        prepare_input = PREPROCESSING_FUNCTIONS.get(config.model_type)
        prompt_text = prepare_input(config, model, tokenizer, prompt_text)
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(config.device)

    topical_model = "lsi"  # "lda"
    if topical_model == "lda":
        lda_config_file = "configs/alexa_lda_config.json"
        lda_model = LDAModel(lda_config_file)
        theta = lda_model.get_theta_matrix()
        psi = lda_model.get_psi_matrix()
        # theta=None

        output_sequences = model.generate(
            input_ids=encoded_prompt,
            psi=psi,
            theta=theta,
            tokenizer=lda_model.tokenizer,
            max_length=config.length,
            temperature=config.temperature,
            top_k=config.top_k,
            top_p=config.top_p,
            repetition_penalty=config.repetition_penalty,
        )

    elif topical_model == "lsi":
        lsi_config_file = "configs/congress_lsi_config.json"
        lsi_model = LSIModel(lsi_config_file)
        topic_word_matrix = lsi_model.get_topic_words_matrix()

        output_sequences = model.generate(
            input_ids=encoded_prompt,
            topic_word_matrix=topic_word_matrix,
            tokenizer=lsi_model.tokenizer,
            max_length=config.length,
            temperature=config.temperature,
            top_k=config.top_k,
            top_p=config.top_p,
            repetition_penalty=config.repetition_penalty,
        )

    # Batch size == 1. To generate more sequences, set num_return_sequences > 1.
    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
    text = text[: text.find(config.stop_token) if config.stop_token else None]

    print(text)

    return text
Example #16
    st.sidebar.text_input("Repetition Penalty: ", 1.2))
session_state.length = int(st.sidebar.text_input("Generated Text Length: ",
                                                 50))

session_state.seed_number = int(st.sidebar.text_input("Seed Number: ", 41))

topic_words = []

if st.button("Plot Topics"):
    with st.spinner('Please be patient ...'):
        if session_state.topic_model == "lda":
            session_state.config = get_draft_config(session_state.topic_model,
                                                    session_state.dataset)
            session_state.config.num_topics = session_state.num_topics
            session_state.config.alpha = session_state.alpha
            lda = LDAModel(session_state.config, build=True)
            all_topic_tokens = lda.get_all_topic_tokens(num_words=15)

            a = lda.get_psi_matrix()
            print("first time", a.max())

            # clean up words
            session_state.topic_words = [[(t[0].strip('Ġ'), t[1]) for t in tw]
                                         for tw in all_topic_tokens]
            plot_config = PlotConfig.from_json_file(
                "configs/lda_plot_config.json")

            fig = visualize_semantic_netwrok(plot_config,
                                             session_state.topic_words,
                                             auto_open=False)
            st.plotly_chart(fig)