def displayTopics():
    """Print every topic of the saved LDA model, one per line.

    Loads the model stored at ``lda_model_path`` and prints each entry
    returned by ``show_topics(lda_num_topics)``, prefixed with its position
    in that list.
    """
    # The dictionary and the Blei corpus used to be loaded here as well but
    # were never read; dropping them avoids two pointless disk reads.
    lda = LdaMulticore.load(lda_model_path)
    for i, topic in enumerate(lda.show_topics(lda_num_topics)):
        # A single parenthesised argument prints the same text under both
        # Python 2 and Python 3.
        print('Topic #' + str(i) + ': ' + str(topic))
def ldaplot():
    """Render the saved LDA model as a pyLDAvis HTML page.

    Reads the model from ``lda.model`` and a pickled two-item tuple from
    ``bow_dict.pk`` (passed to pyLDAvis as corpus and dictionary), then
    writes the prepared visualisation to ``lda.html``.
    """
    model = LdaMulticore.load('lda.model')
    # NOTE: unpickling executes arbitrary code; only load a ``bow_dict.pk``
    # that was produced by a trusted run of this project.
    with open('bow_dict.pk', 'rb') as f:
        # Bound to ``dictionary`` rather than ``dict`` so the builtin is not
        # shadowed for the rest of the function.
        bow, dictionary = pickle.load(f)
    vis = pyLDAvis.gensim.prepare(model, bow, dictionary)
    pyLDAvis.save_html(vis, 'lda.html')
def __init__(self, lda_path, dict_path):
    """
    Load the dictionary and the LDA model from disk.

    param: lda_path str - path to the saved LDA model
    param: dict_path str - path to the saved dictionary
    """
    loaded_dictionary = corpora.Dictionary.load(dict_path)
    loaded_lda = LdaMulticore.load(lda_path)
    self.dictionary = loaded_dictionary
    self.lda = loaded_lda
def run(self):
    """Compute per-row features and pickle them to the task output.

    ``self.input()`` is unpacked as
    ``((model_file, dictionary_file), data_file)``. Every row of the
    unpickled data is passed to ``self.get_features`` together with the
    loaded model and dictionary, and the result is written to
    ``self.output().path``.
    """
    model_and_dictionary, data_target = self.input()
    model_target, dictionary_target = model_and_dictionary

    lda_model = GensimLdaModel.load(model_target.path)
    id2word = corpora.Dictionary.load(dictionary_target.path)
    frame = pd.read_pickle(data_target.path)

    row_features = frame.apply(
        self.get_features,
        axis=1,
        model=lda_model,
        dictionary=id2word,
    )
    row_features.to_pickle(self.output().path)
def eval(self, dataset):
    """Print the model's log-perplexity and a per-document listing.

    Loads the LDA model from a fixed path, builds the corpus and dictionary
    for *dataset* via ``self._prepare``, prints ``log_perplexity`` for that
    corpus and then, for every document, prints its title followed by one
    dictionary entry per pair returned by ``get_document_topics``.
    """
    path = '../models.nosync/lda/model'
    model = LdaMulticore.load(path)
    corpus, dictionary = self._prepare(dataset)
    print(model.log_perplexity(corpus))
    for doc, title in zip(corpus, dataset.titles):
        print(title)
        # NOTE(review): get_document_topics yields (topic_id, probability)
        # pairs, so the id looked up below is a topic id rather than a token
        # id -- confirm this is the intended output.
        for topic_id, _probability in model.get_document_topics(doc):
            # Index the dictionary itself instead of ``id2token``: assuming
            # this is a gensim Dictionary, ``id2token`` is only filled in
            # lazily and a direct lookup can raise KeyError.
            print(dictionary[topic_id], end=' ')
        print('\n')
def getLDA(topics):
    """Return the LDA result for each entry of *topics*.

    Each entry is preprocessed with ``lemma_pp``, converted to a
    bag-of-words vector with the saved corpus dictionary, re-weighted by a
    TF-IDF model and passed through the saved LDA model.

    :param topics: iterable of inputs accepted by ``lemma_pp``
    :return: list with one LDA result per input, in input order
    """
    corp_d = Dictionary.load(MODEL_DIR + "corpus_dict.model")
    lda = LdaMulticore.load(MODEL_DIR + "lda.model")
    dtm = [corp_d.doc2bow(lemma_pp(topic)) for topic in topics]
    # NOTE(review): the TF-IDF weights are fitted on the query documents
    # themselves, not on the corpus the LDA model was trained on -- confirm
    # this is intended.
    tfidf = TfidfModel(dtm)
    # The transformed corpus is built once; it used to be built twice with
    # one copy left unused.
    return list(lda[tfidf[dtm]])
def create_dist_matrix(model_a_dest, model_b_dest, distance='jaccard',
                       num_words=300, normed=True):
    """Build a combined topic-difference matrix for two saved LDA models.

    With ``Ka`` and ``Kb`` the number of rows of the two ``diff`` results,
    the returned array has shape ``(Ka + Kb, Ka + Kb)`` and the layout::

        [[ ones(Ka, Ka),  diff(a, b) ],
         [ diff(b, a),    ones(Kb, Kb) ]]

    :param model_a_dest: path of the first saved ``LdaMulticore`` model
    :param model_b_dest: path of the second saved ``LdaMulticore`` model
    :param distance: forwarded to ``diff``
    :param num_words: forwarded to ``diff``
    :param normed: forwarded to ``diff``
    :return: the block matrix described above
    """
    a = LdaMulticore.load(model_a_dest)
    b = LdaMulticore.load(model_b_dest)

    mdiff_a_b, _ = a.diff(b, distance=distance, num_words=num_words,
                          normed=normed)
    mdiff_b_a, _ = b.diff(a, distance=distance, num_words=num_words,
                          normed=normed)

    # The diagonal blocks must be square in each model's own topic count.
    # Sizing both from mdiff_a_b.shape only concatenates cleanly when the two
    # models happen to have the same number of topics.
    topics_a = mdiff_a_b.shape[0]
    topics_b = mdiff_b_a.shape[0]
    a_ones = np.ones((topics_a, topics_a))
    b_ones = np.ones((topics_b, topics_b))

    left_columns = np.concatenate((a_ones, mdiff_b_a), axis=0)
    right_columns = np.concatenate((mdiff_a_b, b_ones), axis=0)
    return np.concatenate((left_columns, right_columns), axis=1)
def main():
    """Load the saved LDA model and the two pickled test data sets.

    Prints a confirmation and the model's representation after loading.
    """
    lda_model = LdaMulticore.load('lda.model')
    print('Successfully Loaded')
    print(lda_model)

    # Pickle streams are binary: text mode ('r') breaks ``pickle.load`` on
    # Python 3. The ``with`` blocks also close both files, which the previous
    # code never did.
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open('cnn_text.pickle', 'rb') as f:
        test_data1 = pickle.load(f)
    with open('test_dataset.txt', 'rb') as f:
        test_data2 = pickle.load(f)
    # NOTE(review): test_data1 / test_data2 are not used in the visible code.
def load(self, fname, return_dense=True, max_df=0.5, min_df=5, *args,
         **kwargs):
    """Build a ``GensimLDAVectorizer`` from a saved ``LdaMulticore`` model.

    :param fname: path of the saved model, forwarded to
        ``LdaMulticore.load``
    :param return_dense: forwarded to ``GensimLDAVectorizer``
    :param max_df: forwarded to ``GensimLDAVectorizer``
    :param min_df: forwarded to ``GensimLDAVectorizer``
    :param args: extra positional arguments for ``LdaMulticore.load``
    :param kwargs: extra keyword arguments for ``LdaMulticore.load``
    :return: a vectorizer configured with the loaded model's
        ``num_topics``, ``alpha``, ``eta``, ``workers``, ``iterations`` and
        ``random_state``
    """
    # Read the hyper-parameters from the model that was actually loaded.
    # The loaded instance used to be replaced by a fresh ``LdaMulticore()``
    # right after loading, so none of the saved settings were ever used.
    lda = LdaMulticore.load(fname, *args, **kwargs)
    return GensimLDAVectorizer(lda.num_topics, lda.alpha, lda.eta,
                               lda.workers, lda.iterations, return_dense,
                               max_df, min_df, lda.random_state)
def visual_lda():
    """Prepare a pyLDAvis view of the saved LDA model over the review data.

    Loads the model from ``../model/lda.model`` and the reviews from
    ``../result/ad_issue_reviews`` (JSON), removes English stop words and
    tokens that occur only once overall, builds a dictionary and
    bag-of-words corpus from what remains, and hands both to pyLDAvis.

    :return: the object produced by ``pyLDAvis.display``
    """
    from collections import Counter

    import pyLDAvis
    import pyLDAvis.gensim as gensimvis

    lda = LdaMulticore.load("../model/lda.model")
    with open("../result/ad_issue_reviews") as fin:
        reviews = json.load(fin)

    # Build the stop-word set once; calling stopwords.words() inside the
    # comprehension re-read the list for every single word.
    stop_words = set(stopwords.words('english'))
    reviews = [[word for word in review if word not in stop_words]
               for review in reviews]

    # Drop tokens that appear only once across all reviews.
    freq = Counter(token for review in reviews for token in review)
    reviews = [[token for token in review if freq[token] > 1]
               for review in reviews]

    # NOTE(review): this dictionary is rebuilt from the reviews rather than
    # loaded from the one the model was trained with -- confirm the token ids
    # line up with the model.
    dictionary = corpora.Dictionary(reviews)
    corpus = [dictionary.doc2bow(review) for review in reviews]

    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    # Return the display object: inside a function its value was otherwise
    # discarded, so a notebook had nothing to render.
    return pyLDAvis.display(vis_data)
def load_resources():
    """
    Load every serialized object the recommender needs.

    Books are identified in two ways: by their Gutenberg book id and by the
    index at which the book sits in the corpus. The two are not identical.

    :return: model object, corpus vectors object, list of Gutenberg ids,
        dictionary mapping each id (as int) to its index in that list, and
        the unpickled distance matrix (``None`` when its file is absent)
    """
    def _resource(name):
        return path.join(_RELATIVE_DIR, name)

    model = LdaMulticore.load(_resource(_CURRENT_MODEL))
    corpus = _unpickle(_resource(_CURRENT_CORPUS))
    ids = _unpickle(_resource(_CURRENT_TITLES))

    ids_to_ind_dict = {}
    for position, book_id in enumerate(ids):
        ids_to_ind_dict[int(book_id)] = position

    dist_file = _resource(_CURRENT_MODEL + '.distance_matrix.pkl')
    dist_mat = _unpickle(dist_file, True) if path.isfile(dist_file) else None
    return model, corpus, ids, ids_to_ind_dict, dist_mat
from gensim import corpora from gensim.models import LdaMulticore import string # loading the dataframe cleaned_data = pd.read_csv('./dataframes/final_df.csv', index_col=0, nrows=1000) # selecting the 3grams_reviews for topic modeling cleaned_data['3gram_reviews'] = cleaned_data['3gram_reviews'].map( lambda x: ''.join(c for c in x if c == '_' or c not in string.punctuation).split()) # selecting the model lda_model = LdaMulticore.load('.model/model.model') documents = list(cleaned_data['3gram_reviews']) dictionary = gensim.corpora.Dictionary(documents) # loading the word dict dictionary_saved = gensim.corpora.Dictionary.load( './model/model.model.id2word') corpus = [dictionary_saved.doc2bow(text) for text in documents] # making the topic dict topicDictionary = { '0': 'Network Performance', '1': 'Overall Experience', '2': 'Gameplay Mechanics', '3': 'Content/Value', '4': 'NO TOPIC',
def load_from_file(self):
    """Load the LDA model saved as ``lda_out_<id>.model`` for ``self.id``."""
    from gensim.models import LdaMulticore

    model_filename = "lda_out_%d.model" % self.id
    return LdaMulticore.load(model_filename)
comments_corpus = text2corpus(comments_text_filtered, dictionary) del comments_text_filtered with open("temp_corpus.pickle", "wb") as f: pickle.dump((comments_corpus, dictionary), f) else: with open("temp_corpus.pickle", "rb") as f: comments_corpus, dictionary = np.array(pickle.load(f)) print("created corpus") print('Number of unique tokens: %d' % len(dictionary)) print('Number of documents: %d' % len(comments_corpus)) num_topics = 150 if args.load: model = LdaMulticore.load("topic_models/model_comments") else: model = LdaMulticore(comments_corpus, id2word=dictionary, num_topics=num_topics) print("model done") model.save("topic_models/model_comments") print(model.print_topics(20)) top_topics = model.top_topics(comments_corpus) #, num_words=20) # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics. avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics print('Average topic coherence: %.4f.' % avg_topic_coherence) #from pprint import pprint #pprint(top_topics)
from model.lda.preprocess import Preprocessor
from model.util.file_parser import parse_dir_json

# Script entry point: compute the c_v coherence of a saved LDA model on the
# configured documents and append "<topics>;<coherence>" to a CSV file.
if __name__ == '__main__':
    import csv

    init_logger()
    log = logging.getLogger('lda_model')
    config = LdaConfig(sys.argv[1], 'lda_model').get_current_config()

    # Only the second element of each parsed pair is used.
    _, docs = zip(*parse_dir_json(config['data_path']))
    preprocessed_docs = Preprocessor(
        max_workers=config['max_workers']).process_docs(docs)

    log.info("Loading model from %s", config['model_path'])
    lda_model = LdaMulticore.load(config['model_path'])
    log.info("Loading dictionary from %s", config['dict_path'])
    dictionary = Dictionary.load(config['dict_path'])

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=preprocessed_docs,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    # newline='' lets the csv module control line endings; without it every
    # row is followed by a blank line on platforms that translate '\n'.
    with open(config['coherence_path'], "a", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=';')
        writer.writerow([config['topics'], coherence_lda])
def _load_or_compute_topic_proba(cache_path, data, lda_model, label):
    """Return the topic probabilities cached at *cache_path*, computing and caching them on a miss."""
    try:
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)
    except Exception:
        # Any failure to read the cache falls back to recomputation, as it
        # did before. Unlike a bare ``except`` this no longer swallows
        # KeyboardInterrupt / SystemExit.
        print("Prepare %s data" % label)
    topic_proba = get_topic_proba(data, lda_model)
    with open(cache_path, "wb") as cache_file:
        pickle.dump(topic_proba, cache_file)
    return topic_proba


def LDA_model_out_of_time_tpot(df, features, target, dest_all_model,
                               downsample=False):
    """Evaluate TPOT classifiers on LDA topic features with time-based CV.

    For every fold produced by ``TimeBasedCV`` the fold's LDA model is
    loaded, train and test rows are turned into topic probabilities (read
    from a pickle cache under ``main_dir + 'data/topic_predictions/'`` when
    available, otherwise computed with ``get_topic_proba`` and cached), and
    a ``TPOTClassifier`` is fitted and scored.

    :param df: data frame holding the feature columns, the target column
        and a ``'sec_filing_date'`` column
    :param features: columns selected from *df*; must include
        ``'sec_filing_date'``, which is dropped before fitting
    :param target: column of *df* used as the label (cast to bool)
    :param dest_all_model: mapping from the 1-based fold number (as a
        string) to a two-item sequence whose first item is the model path
        relative to ``lda_data_dir``
    :param downsample: when true, use the ``_downsample`` cache files
    :return: ``(scores, clf, cf_matrix_val)`` -- the per-fold accuracy and
        F1 lists, the classifier fitted on the last fold, and the confusion
        matrix summed over all folds
    """
    X = df[features]
    y = df[target].astype('bool')
    scores = {'acc': [], 'f1': []}
    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy; the builtin yields the same dtype.
    cf_matrix_val = np.zeros((2, 2), dtype=int)

    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    k_folds = len(tbcv_folds)

    # The downsampled and regular runs differ only in the cache file names.
    cache_infix = '_downsample' if downsample else ''

    for k_index, (train_index, test_index) in enumerate(tbcv_folds):
        fold = str(k_index + 1)
        dest_train, _dest_val = dest_all_model[fold]

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]
        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        print("=========================================")
        print("==== K Fold Validation step => %d/%d ======" %
              (k_index + 1, k_folds))
        print("=========================================")

        lda_model_train = LdaMulticore.load(lda_data_dir + dest_train)

        cache_stem = main_dir + 'data/topic_predictions/' + fold + cache_infix
        data_train = _load_or_compute_topic_proba(
            cache_stem + '_data_train.list', data_train, lda_model_train,
            "Train")
        # Test rows are projected with the model trained on the train period.
        data_test = _load_or_compute_topic_proba(
            cache_stem + '_data_test.list', data_test, lda_model_train,
            "Test")

        clf = TPOTClassifier(generations=5,
                             population_size=50,
                             verbosity=2,
                             max_time_mins=5)
        clf.fit(data_train, target_train.values.ravel())
        preds = clf.predict(data_test)

        # accuracy for the current fold only
        score = clf.score(data_test, target_test)
        f1 = f1_score(target_test, preds)
        cf_matrix_val += confusion_matrix(target_test, preds)
        scores['acc'].append(score)
        scores['f1'].append(f1)

    print("Cross Validation Score: " +
          str(sum(scores['acc']) / len(scores['acc'])))

    # Visualize confusion matrix for cross-val data
    labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    categories = ['No Downgrade', 'Downgrade']
    make_confusion_matrix(cf_matrix_val,
                          group_names=labels,
                          categories=categories,
                          cbar=False,
                          title='Confusion Matrix: TPOT',
                          figsize=(10, 10))
    return scores, clf, cf_matrix_val
def load_model(self, model_path, dict_path):
    """Load the LDA model and its dictionary onto this instance.

    Logs the model path, then stores the loaded model as ``self.model`` and
    the loaded dictionary as ``self.dictionary``.
    """
    message = "Loading model from {}".format(model_path)
    logger.info(message)
    self.model = LdaMulticore.load(model_path)
    self.dictionary = Dictionary.load(dict_path)
def pretrained(self, lda, dicti):
    """Load a saved dictionary and LDA model and return them as a pair.

    :param lda: path of the saved ``LdaMulticore`` model
    :param dicti: path of the saved dictionary
    :return: ``(dictionary, model)``
    """
    loaded_dictionary = corpora.Dictionary.load(dicti)
    loaded_model = LdaMulticore.load(lda)
    return loaded_dictionary, loaded_model
def get_lda_model(mode=QUESTION):
    """Load the LDA model whose file name is registered for *mode* in ``modes``."""
    model_path = os.path.join(MODEL_DIR, modes[mode].model)
    loaded_model = LdaMulticore.load(model_path)
    return loaded_model
res_reduced = results[indices] plot_sentiment_time(res_reduced, time_filtered, title_suffix=", for fluff flair", bins=60) time_filtered, indices = filter_comment_flair(comments_list, "created_utc", "IMAGE") res_reduced = results[indices] plot_sentiment_time(res_reduced, time_filtered, title_suffix=", for image flair", bins=60) if args.topic_model: topic_model = LdaMulticore.load(args.topic_model) if not args.load_preprocess: comments_text = load_all_comments(db_name=args.comments_cached, only="body") comments_text_filtered, dictionary, comments_tokenized = preprocess_text( comments_text) print("filtered text") comments_corpus = text2corpus(comments_text_filtered, dictionary) del comments_text_filtered with open("temp_corpus.pickle", "wb") as f: pickle.dump((comments_corpus, dictionary), f) else: with open("temp_corpus.pickle", "rb") as f:
tokens_or_none = [ tok.lower_ for tok in doc if not tok.is_stop and tok.is_alpha ] else: tokens_or_none = None else: tokens_or_none = None return (tokens_or_none, component) docs = defaultdict(list) models = defaultdict() print("Loading models...") for component in components: models[component] = LdaMulticore.load(models_path + component + '.model.topic') # Compute cross-corpus topic similarities diffs = defaultdict(defaultdict) for (train, test) in itertools.product(*(components, components)): print(f'Computing topic diff for {train} on {test}...') diff, annotation = models[train].diff(models[test]) diffs[train][test] = diff.tolist() fig, ax = plt.subplots(1, 1) img = plt.imshow(diff, cmap='gray') ax.set_title("Topic Model Difference Matrix") ax.set_ylabel(train)
def getModel(self):
    """Load the dictionary and LDA model named by the current state.

    The file names are read from the ``"dictionary"`` and ``"lda"`` entries
    of ``get_current_state()`` and resolved under ``models/``.

    :return: ``(dictionary, model)``
    """
    # Read the state once so both file names come from the same snapshot;
    # two separate calls could pair a dictionary with a model from a
    # different state if it changed in between.
    state = get_current_state()
    dictionary = corpora.Dictionary.load("models/" + state["dictionary"])
    model = LdaMulticore.load("models/" + state["lda"])
    return dictionary, model
import string
import gensim
from gensim.models import LdaMulticore
from gensim import corpora
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()


def _strip_punctuation_and_split(text):
    """Drop every punctuation character except '_' from *text*, then split on whitespace."""
    kept = [c for c in text if c == '_' or c not in string.punctuation]
    return ''.join(kept).split()


# Cleaned data frame produced by the earlier preprocessing step.
model_data_frame = pd.read_csv('./dataframes/model2_df.csv', index_col=0)

# Previously trained topic model.
lda_model = LdaMulticore.load('./models/nouns_only/model/model.model')

# Turn the text columns into token lists so they can be fed to doc2bow below.
model_data_frame['clean_reviews'] = model_data_frame['clean_reviews'].map(
    _strip_punctuation_and_split)
model_data_frame['3grams_nouns'] = model_data_frame['3grams_nouns'].map(
    _strip_punctuation_and_split)

documents = list(model_data_frame['3grams_nouns'])
dictionary = gensim.corpora.Dictionary(documents)

# Reload the dictionary saved alongside the model; filter_extremes() was
# already applied to it during the training step.
dictionary_saved = gensim.corpora.Dictionary.load(
    './models/nouns_only/model/model.model.id2word')
corpus = [dictionary_saved.doc2bow(text) for text in documents]

# Keywords associated with each topic.
topic_list = lda_model.print_topics(num_topics=5, num_words=15)
corpus = [dictionary.doc2bow(review["words"]) for review in corpus_list] BleiCorpus.serialize(corpus_path, corpus, id2word=dictionary) corpus = corpora.BleiCorpus(corpus_path) print "running lda" lda = gensim.models.LdaMulticore(corpus, num_topics=lda_num_topics, id2word=dictionary, minimum_probability=0., workers=8) lda.save(lda_model_path) print "done lda" dictionary = corpora.Dictionary.load(dictionary_path) corpus = corpora.BleiCorpus(corpus_path) lda = LdaMulticore.load(lda_model_path) i = 0 for topic in lda.show_topics(lda_num_topics): print type(topic) print 'Topic #' + str(i) + ': ' + str(topic) i += 1 nwords_per_topic = 10 topic_words_set = set() for i in range(lda_num_topics): topic_terms = lda.get_topic_terms(i, topn=nwords_per_topic) for term in topic_terms: topic_words_set.add(term[0]) # print 'Topic_Words = ', topic_words_set print 'Topic_Words_len = ', len(topic_words_set) topic_word_id = dict(zip(topic_words_set, range(len(topic_words_set))))
def summarize(csv_content):
    """Summarise review sentiment per dominant LDA topic.

    Reads up to 1500 rows from *csv_content*, assigns every review the topic
    with the highest weight under the saved LDA model, scores each review's
    VADER compound sentiment, and reports for every topic the share of
    positive (> 0.1), neutral and negative (< -0.1) reviews.

    :param csv_content: anything ``pandas.read_csv`` accepts; the data must
        provide ``'review'`` and ``'3gram_reviews'`` columns
    :return: dict mapping topic label to
        ``[positive, neutral, negative]`` fractions rounded to 3 places;
        topics without any review are omitted
    """
    cleaned_data = pd.read_csv(csv_content, index_col=0, nrows=1500)
    # Strip punctuation (keeping '_', which joins n-grams) and tokenise.
    cleaned_data['3gram_reviews'] = cleaned_data['3gram_reviews'].map(
        lambda x: ''.join(c for c in x
                          if c == '_' or c not in string.punctuation).split())

    lda_model = LdaMulticore.load('model.model')
    documents = list(cleaned_data['3gram_reviews'])
    # Only the dictionary saved with the model is needed; a second one used
    # to be built from the documents and then never read.
    dictionary_saved = gensim.corpora.Dictionary.load('model.model.id2word')
    corpus = [dictionary_saved.doc2bow(text) for text in documents]

    topicDictionary = {
        '0': 'Performance',
        '1': 'Overall_Experience',
        '2': 'Gameplay_Mechanics',
        '3': 'Content_Value',
        '4': 'NO_TOPIC',
    }

    def identifyReviewTopics():
        """Return one row per document: dominant topic, its weight, its keywords, plus the original text."""
        rows = []
        for topic_weights in lda_model[corpus]:
            if not topic_weights:
                # No topic reported for this document; nothing to record.
                continue
            # First entry with the highest weight, i.e. what a stable
            # descending sort would put in position 0.
            topicNumber, prop_topic = max(topic_weights, key=lambda x: x[1])
            topicKeywords = ", ".join(
                word for word, _ in lda_model.show_topic(topicNumber))
            rows.append((topicDictionary[str(topicNumber)],
                         round(prop_topic, 4), topicKeywords))
        # Build the frame in one go: DataFrame.append was removed in
        # pandas 2.0 and copied the whole frame on every call.
        reviewTopicsDataframe = pd.DataFrame(
            rows,
            columns=['Dominant_Topic', 'Contribution_Percentage', 'Keywords'])
        # Rows are matched by position, so drop the CSV index before joining;
        # otherwise a non-0..n-1 index would misalign topics and reviews.
        originalDataframe = cleaned_data[['review', '3gram_reviews'
                                          ]].reset_index(drop=True)
        return pd.concat([reviewTopicsDataframe, originalDataframe], axis=1)

    reviewTopicsDf = identifyReviewTopics()
    sentAnalyzer = SentimentIntensityAnalyzer()
    reviewTopicsDf['compound_sentiment'] = reviewTopicsDf['review'].map(
        lambda x: sentAnalyzer.polarity_scores(x)['compound'])

    sentimentDictionary = {}
    for topic in topicDictionary.values():
        isCurrentTopic = reviewTopicsDf['Dominant_Topic'] == topic
        sentimentList = reviewTopicsDf[isCurrentTopic]['compound_sentiment']
        totalCount = len(sentimentList)
        print(topic)
        print(totalCount)
        if totalCount > 0:
            positiveCount = sum(1 for x in sentimentList if x > 0.1)
            negativeCount = sum(1 for x in sentimentList if x < -0.1)
            # Everything that is neither positive nor negative is neutral;
            # counting by subtraction avoids a quadratic membership scan.
            neutralCount = totalCount - positiveCount - negativeCount
            sentimentDictionary[topic] = [
                round(positiveCount / totalCount, 3),
                round(neutralCount / totalCount, 3),
                round(negativeCount / totalCount, 3)
            ]

    # output percentages as positive, neutral, then negative
    print("\nBy review\n", sentimentDictionary)
    return sentimentDictionary
def __init__(self):
    """Load the dictionary from ``dictionary_path`` and the LDA model from ``lda_model_path``."""
    loaded_dictionary = corpora.Dictionary.load(dictionary_path)
    loaded_lda = LdaMulticore.load(lda_model_path)
    self.dictionary = loaded_dictionary
    self.lda = loaded_lda
def load_from_file(self):
    """Return the LDA model stored in ``lda_out_<id>.model``, where ``<id>`` is ``self.id``."""
    from gensim.models import LdaMulticore

    saved_model_name = "lda_out_%d.model" % self.id
    return LdaMulticore.load(saved_model_name)
def infer(self, dataset):
    """Load the saved LDA model and prepare *dataset*.

    :return: ``(corpus, model)`` -- the corpus produced by
        ``self._prepare`` and the loaded model
    """
    model_location = '../models.nosync/lda/model'
    lda_model = LdaMulticore.load(model_location)
    # _prepare also returns a dictionary, which is not needed here.
    corpus, _dictionary = self._prepare(dataset)
    return corpus, lda_model
def load_lda_model_multicores(filepath):
    """Return the ``LdaMulticore`` model saved at *filepath*."""
    loaded_model = LdaMulticore.load(filepath)
    return loaded_model
def model(n_topics,
          alpha=None,
          beta=None,
          saved=False,
          pyldavis=False,
          wordclouds=False,
          rep_letters=False,
          plots=False) -> dict:
    """Train or load an LDA model with *n_topics* topics and evaluate it.

    Side effects: rebinds the module globals ``pwt``, ``vw`` and ``vws``;
    saves a newly trained model to ``f"{TRAINED_LDA}{n_topics}"``.

    :param n_topics: number of topics; must be at least 2
    :param alpha: passed as ``alpha`` when training (``"symmetric"`` when
        ``None``)
    :param beta: passed as ``eta`` when training
    :param saved: load the model from ``f"{TRAINED_LDA}{n_topics}"`` instead
        of training it
    :param pyldavis: also write a pyLDAvis HTML page under ``PYLDAVIS_PATH``
    :param wordclouds: also call ``save_topic_wordclouds``
    :param rep_letters: also call ``save_representative_letters``
    :param plots: also call the per-year and per-recipient plot helpers
    :return: dict with keys ``model``, ``num_topics``, ``alpha``, ``beta``,
        ``coherence``, ``silhouette``, ``vws`` and ``pwt``
    :raises AssertionError: if ``n_topics < 2``
    """
    global pwt, vw, vws

    # Kept as an assertion so callers that catch AssertionError still work.
    assert n_topics >= 2

    # Helpers meant to make sure the loaded model is the desired one.
    def verify_alpha(lda_model, given):
        actual = lda_model.alpha
        if given == "asymmetric":
            return not np.isclose(actual[0], actual[-1])
        elif given == "symmetric":
            return np.isclose(actual[0], actual[-1])
        else:
            return np.isclose(given, actual[0]) and np.isclose(
                given, actual[-1])

    def verify_beta(lda_model, given):
        actual = lda_model.eta
        # isinstance also accepts float subclasses such as numpy.float64.
        if isinstance(given, float):
            # Compare with isclose: exact == on floats is unreliable.
            return np.isclose(given, actual[0]) and np.isclose(
                given, actual[-1])
        else:
            return False

    print(f"Building LDA model for {n_topics} topics.")
    if saved:
        lda = LdaMulticore.load(f"{TRAINED_LDA}{n_topics}")
        # NOTE: parameter verification is currently disabled, so the alpha
        # and beta echoed in the result are the caller's values, not
        # necessarily those of the loaded model.
        # if not (verify_alpha(lda, alpha) and verify_beta(lda, beta)):
        #     print("Loaded model didn't pass parameter verification; train it from scratch or load the correct one.")
        #     return
        print(f"Trained LDA model with {n_topics} topics loaded successfully.")
    else:
        lda = LdaMulticore(
            corpus,
            num_topics=n_topics,
            id2word=dictionary,
            passes=20,
            alpha=alpha if alpha is not None else "symmetric",  # default
            eta=beta,
            random_state=1,
            iterations=100,
            eval_every=5,
            workers=3,
            per_word_topics=True)
        lda.save(f"{TRAINED_LDA}{n_topics}")
        print(
            f"LDA model with {n_topics} topics trained and saved successfully."
        )

    # Per-word-topics 3D matrix. [!] alters global variable
    V = len(dictionary)
    K = n_topics
    N = len(corpus)
    pwt = np.zeros((V, K, N))

    # Topic assignment info in dataframes. [!] alters global variables
    vws = get_topic_dists_dataframe(lda)
    vw, vws = set_main_topics(vw, vws)

    # Coherence and silhouette scores.
    coherence = CoherenceModel(model=lda,
                               texts=letters,
                               dictionary=dictionary,
                               coherence='c_v').get_coherence()
    print(f"Coherence score: {coherence}")  # the higher the better
    avg_silhouette = plot_silhouette(vws)
    print(f"Average silhouette coefficient: {avg_silhouette}"
          )  # the higher the better

    # Other validation methods.
    if pyldavis:
        vis = pyLDAvis.gensim.prepare(topic_model=lda,
                                      corpus=corpus,
                                      dictionary=dictionary,
                                      n_jobs=3)
        pyLDAvis.save_html(vis, f"{PYLDAVIS_PATH}/lda{n_topics}.html")
    if rep_letters:
        save_representative_letters(vws, 3)
    if wordclouds:
        save_topic_wordclouds(pwt)
    if plots:
        plot_topics_per_year(vw)
        plot_topics_per_recipient(vw)

    return {
        "model": lda,
        "num_topics": n_topics,
        "alpha": alpha,
        "beta": beta,
        "coherence": coherence,
        "silhouette": avg_silhouette,
        "vws": vws,
        "pwt": pwt
    }
from gensim.models import LdaMulticore as LDA
import os
import argparse

# Command-line options select which saved model to inspect.
parser = argparse.ArgumentParser(description='View generated topics')
parser.add_argument('--n_topics', help='Number of Topics')
parser.add_argument('--gram', help='unigram or both')
args = parser.parse_args()

# Models are stored under "<cwd>/LDA models/<gram>/<n_topics>-topics".
model_subpath = "/LDA models/{}/{}-topics".format(args.gram, args.n_topics)
model = LDA.load(os.getcwd() + model_subpath)

# Print each topic id followed by its term listing.
for topic_id, topic_terms in model.show_topics():
    print("Topic-{} \n".format(topic_id))
    print(topic_terms)
    print("\n")