Python LdaMulticore.load Exemples, gensim.models.LdaMulticore.load Python Exemples

Exemple #1

0

Afficher le fichier

def displayTopics():
    """Load the saved LDA model and print each of its topics on its own line.

    Reads the ``lda_model_path`` and ``lda_num_topics`` names defined outside
    this function.
    """
    # Only the model is loaded: the dictionary and corpus are not needed to
    # list the topics.
    lda = LdaMulticore.load(lda_model_path)
    for i, topic in enumerate(lda.show_topics(lda_num_topics)):
        # A single-argument print() call behaves the same on Python 2 and 3.
        print('Topic #' + str(i) + ': ' + str(topic))

Exemple #2

0

Afficher le fichier

def ldaplot():
    """Write a pyLDAvis visualisation of the saved LDA model to ``lda.html``.

    The model is loaded from ``lda.model``; the two further inputs of
    ``pyLDAvis.gensim.prepare`` are unpickled as a pair from ``bow_dict.pk``.
    """
    model = LdaMulticore.load('lda.model')
    # NOTE(review): pickle.load can execute code embedded in the file; only
    # use this with a trusted ``bow_dict.pk``.
    with open('bow_dict.pk', 'rb') as f:
        # Named ``dictionary`` so the builtin ``dict`` is not shadowed.
        bow, dictionary = pickle.load(f)
    vis = pyLDAvis.gensim.prepare(model, bow, dictionary)
    pyLDAvis.save_html(vis, 'lda.html')

Exemple #3

0

Afficher le fichier

Fichier : predictor.py Projet : trkohler/dla_model

    def __init__(self, lda_path, dict_path):
        """
        Load the dictionary and the LDA model and keep both on the instance.

        param: lda_path str - path to the saved LDA model
        param: dict_path str - path to the saved dictionary
        """
        saved_dictionary = corpora.Dictionary.load(dict_path)
        self.dictionary = saved_dictionary
        trained_lda = LdaMulticore.load(lda_path)
        self.lda = trained_lda

Exemple #4

0

Afficher le fichier

Fichier : pipeline.py Projet : frayalisia/questions

 def run(self):
     """Compute per-row features for the input data and pickle the result.

     ``self.input()`` supplies the model and dictionary targets plus the data
     target; ``self.get_features`` is applied to every row with the loaded
     model and dictionary passed as keyword arguments, and the outcome is
     written to ``self.output().path``.
     """
     (model_target, dictionary_target), data_target = self.input()
     lda_model = GensimLdaModel.load(model_target.path)
     id2word = corpora.Dictionary.load(dictionary_target.path)
     frame = pd.read_pickle(data_target.path)
     row_kwargs = {'model': lda_model, 'dictionary': id2word}
     features = frame.apply(self.get_features, axis=1, **row_kwargs)
     features.to_pickle(self.output().path)

Exemple #5

0

Afficher le fichier

 def eval(self, dataset):
     """Print the saved model's log-perplexity on ``dataset`` and a per-document listing.

     For every document its title (from ``dataset.titles``) is printed,
     followed by one dictionary entry per pair returned by
     ``get_document_topics``.
     """
     model = LdaMulticore.load('../models.nosync/lda/model')
     corpus, dictionary = self._prepare(dataset)
     print(model.log_perplexity(corpus))
     for bow, title in zip(corpus, dataset.titles):
         print(title)
         # NOTE(review): the first element of each pair returned by
         # get_document_topics is used as a key into dictionary.id2token --
         # confirm that looking a topic id up as a token id is intended.
         for key, _ in model.get_document_topics(bow):
             print(dictionary.id2token[key], end=' ')
         print('\n')

Exemple #6

0

Afficher le fichier

Fichier : update_headlines.py Projet : Derpimort/News-popularity-analysis

def getLDA(topics):
    """Return the saved LDA model's output for every entry of ``topics``.

    Each entry is preprocessed with ``lemma_pp``, converted to a bag-of-words
    vector with the saved dictionary, TF-IDF weighted and passed through the
    saved LDA model.

    :param topics: iterable of inputs accepted by ``lemma_pp``
    :return: list with one model output per input entry
    """
    corp_d = Dictionary.load(MODEL_DIR + "corpus_dict.model")
    lda = LdaMulticore.load(MODEL_DIR + "lda.model")

    dtm = [corp_d.doc2bow(lemma_pp(topic)) for topic in topics]
    # NOTE(review): the TF-IDF model is fitted on the documents being scored,
    # not on a training corpus -- confirm this is intended.
    tfidf = TfidfModel(dtm)
    # Transform once and reuse the result instead of applying ``tfidf`` twice.
    corp_tfidf = tfidf[dtm]
    return list(lda[corp_tfidf])

Exemple #7

0

Afficher le fichier

Fichier : 10_LDA_out-of-time_validation.py Projet : CanHuang11/sec-filings-nlp

def create_dist_matrix(model_a_dest, model_b_dest, distance='jaccard',
                       num_words=300, normed=True):
    """Build a square block matrix of topic differences between two LDA models.

    The result is laid out as::

        [[ ones(A, A),  a.diff(b) ],
         [ b.diff(a),   ones(B, B) ]]

    where ``(A, B)`` is the shape of ``a.diff(b)``.

    :param model_a_dest: path of the first saved model
    :param model_b_dest: path of the second saved model
    :param distance: forwarded to ``diff``
    :param num_words: forwarded to ``diff``
    :param normed: forwarded to ``diff``
    :return: numpy array of shape ``(A + B, A + B)``
    """
    a = LdaMulticore.load(model_a_dest)
    b = LdaMulticore.load(model_b_dest)

    # diff() also returns an annotation, which is not needed here.
    topic_diff_a_b, _ = a.diff(b, distance=distance, num_words=num_words, normed=normed)
    topic_diff_b_a, _ = b.diff(a, distance=distance, num_words=num_words, normed=normed)

    # Size each diagonal block from its own side of the difference matrix so
    # that models with different topic counts can be combined; when both
    # counts are equal the blocks have the same shape as before.
    n_a, n_b = topic_diff_a_b.shape
    a_ones = np.ones((n_a, n_a))
    b_ones = np.ones((n_b, n_b))

    first_half = np.concatenate((a_ones, topic_diff_b_a), axis=0)
    second_half = np.concatenate((topic_diff_a_b, b_ones), axis=0)
    return np.concatenate((first_half, second_half), axis=1)

Exemple #8

0

Afficher le fichier

Fichier : getTestData.py Projet : buddhikavidra/CSE-573-Final-Project-Document-Clustering-and-Visualization

def main():
    """Load the saved LDA model and unpickle the two test data files."""
    lda_model = LdaMulticore.load('lda.model')
    print('Successfully Loaded')
    print(lda_model)

    # Pickle data is binary: open with 'rb' (text mode breaks pickle.load on
    # Python 3) and let the context manager close each file.
    # NOTE(review): pickle.load can execute code embedded in the file; only
    # use it on trusted files.
    with open('cnn_text.pickle', 'rb') as f:
        test_data1 = pickle.load(f)

    with open('test_dataset.txt', 'rb') as f:
        test_data2 = pickle.load(f)

Exemple #9

0

Afficher le fichier

 def load(self,
          fname,
          return_dense=True,
          max_df=0.5,
          min_df=5,
          *args,
          **kwargs):
     """Build a ``GensimLDAVectorizer`` configured like the model saved at ``fname``.

     :param fname: path handed to ``LdaMulticore.load``
     :param return_dense: forwarded to ``GensimLDAVectorizer``
     :param max_df: forwarded to ``GensimLDAVectorizer``
     :param min_df: forwarded to ``GensimLDAVectorizer``
     :param args: extra positional arguments for ``LdaMulticore.load``
     :param kwargs: extra keyword arguments for ``LdaMulticore.load``
     :return: a new ``GensimLDAVectorizer``
     """
     # Take the hyperparameters from the loaded model itself; replacing it
     # with a freshly constructed LdaMulticore() would yield the library
     # defaults instead of the saved values.
     lda = LdaMulticore.load(fname, *args, **kwargs)
     alpha = lda.alpha
     eta = lda.eta
     iterations = lda.iterations
     random_seed = lda.random_state
     workers = lda.workers
     num_topics = lda.num_topics
     return GensimLDAVectorizer(num_topics, alpha, eta, workers, iterations,
                                return_dense, max_df, min_df, random_seed)

Exemple #10

0

Afficher le fichier

Fichier : extract_issues.py Projet : zengjichuan/AppReview-IDEA-

def visual_lda():
    """Prepare and display a pyLDAvis view of the saved LDA model.

    Reviews are read as JSON from ``../result/ad_issue_reviews``, stripped of
    English stop words and of tokens that occur only once, and converted to a
    bag-of-words corpus that is handed to pyLDAvis together with the model.
    """
    from collections import Counter
    import pyLDAvis.gensim as gensimvis
    import pyLDAvis

    lda = LdaMulticore.load("../model/lda.model")
    with open("../result/ad_issue_reviews") as fin:
        reviews = json.load(fin)

    # build bag-of-words, corpus
    # Fetch the stop-word list once instead of once per token, and use a set
    # so each membership test is constant time.
    english_stopwords = set(stopwords.words('english'))
    reviews = [[word for word in review if word not in english_stopwords]
               for review in reviews]

    # Keep only tokens that appear more than once across all reviews.
    freq = Counter(token for review in reviews for token in review)
    reviews = [[token for token in review if freq[token] > 1]
               for review in reviews]

    dictionary = corpora.Dictionary(reviews)
    corpus = [dictionary.doc2bow(review) for review in reviews]
    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    # NOTE(review): the value returned by display() is discarded, so this
    # presumably only has a visible effect in a notebook -- confirm.
    pyLDAvis.display(vis_data)

Exemple #11

0

Afficher le fichier

Fichier : recommender.py Projet : brendanlafferty/gutenberg_nlp

def load_resources():
    """
    Load every serialized object the recommender needs.

    Books are identified in two ways: by their gutenberg book id and by the
    index at which the book sits in the corpus.  The two are not identical,
    so a lookup from id to index is built here.

    :return: model object, corpus vects object, list of gutenberg ids,
             dictionary of gutenberg id (as int) to book index number,
             and the distance matrix (None when its file does not exist)
    """
    model_location = path.join(_RELATIVE_DIR, _CURRENT_MODEL)
    model = LdaMulticore.load(model_location)
    corpus = _unpickle(path.join(_RELATIVE_DIR, _CURRENT_CORPUS))
    ids = _unpickle(path.join(_RELATIVE_DIR, _CURRENT_TITLES))

    ids_to_ind_dict = {}
    for position, book_id in enumerate(ids):
        ids_to_ind_dict[int(book_id)] = position

    distance_mat_location = path.join(
        _RELATIVE_DIR, _CURRENT_MODEL + '.distance_matrix.pkl')
    # The distance matrix is optional: fall back to None when it is absent.
    dist_mat = (_unpickle(distance_mat_location, True)
                if path.isfile(distance_mat_location) else None)
    return model, corpus, ids, ids_to_ind_dict, dist_mat

Exemple #12

0

Afficher le fichier

from gensim import corpora
from gensim.models import LdaMulticore
import string

# loading the dataframe
cleaned_data = pd.read_csv('./dataframes/final_df.csv',
                           index_col=0,
                           nrows=1000)

# selecting the 3grams_reviews for topic modeling: drop every punctuation
# character except '_' and split the remaining text on whitespace
cleaned_data['3gram_reviews'] = cleaned_data['3gram_reviews'].map(
    lambda x: ''.join(c for c in x
                      if c == '_' or c not in string.punctuation).split())

# selecting the model
# The model is read from the same ./model/ directory as its
# 'model.model.id2word' companion file loaded below ('.model/...' would
# point at a different, hidden directory).
lda_model = LdaMulticore.load('./model/model.model')
documents = list(cleaned_data['3gram_reviews'])
dictionary = gensim.corpora.Dictionary(documents)

# loading the word dict
dictionary_saved = gensim.corpora.Dictionary.load(
    './model/model.model.id2word')
corpus = [dictionary_saved.doc2bow(text) for text in documents]

# making the topic dict
topicDictionary = {
    '0': 'Network Performance',
    '1': 'Overall Experience',
    '2': 'Gameplay Mechanics',
    '3': 'Content/Value',
    '4': 'NO TOPIC',

Exemple #13

0

Afficher le fichier

Fichier : models.py Projet : hds-lab/textvisdrg-prototype

    def load_from_file(self):
        """Load and return the LDA model stored in the file named after ``self.id``."""
        from gensim.models import LdaMulticore

        model_filename = "lda_out_%d.model" % self.id
        return LdaMulticore.load(model_filename)

Exemple #14

0

Afficher le fichier

Fichier : topic_modeling.py Projet : Argetlame-sc/sentiment_analysis_reddit

        comments_corpus = text2corpus(comments_text_filtered, dictionary)
        del comments_text_filtered
        with open("temp_corpus.pickle", "wb") as f:
            pickle.dump((comments_corpus, dictionary), f)
    else:
        with open("temp_corpus.pickle", "rb") as f:
            comments_corpus, dictionary = np.array(pickle.load(f))

    print("created corpus")
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(comments_corpus))

    num_topics = 150
    if args.load:
        model = LdaMulticore.load("topic_models/model_comments")
    else:
        model = LdaMulticore(comments_corpus, id2word=dictionary, num_topics=num_topics)
        print("model done")
        model.save("topic_models/model_comments")

    print(model.print_topics(20))

    top_topics = model.top_topics(comments_corpus) #, num_words=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    #from pprint import pprint
    #pprint(top_topics)

Exemple #15

0

Afficher le fichier

from model.lda.preprocess import Preprocessor
from model.util.file_parser import parse_dir_json

if __name__ == '__main__':
    import csv

    init_logger()
    log = logging.getLogger('lda_model')

    # The first command-line argument is handed to LdaConfig.
    config = LdaConfig(sys.argv[1], 'lda_model').get_current_config()

    # parse_dir_json yields pairs; only the second element of each is used.
    _, docs = zip(*parse_dir_json(config['data_path']))

    preprocessed_docs = Preprocessor(
        max_workers=config['max_workers']).process_docs(docs)

    log.info("Loading model from %s", config['model_path'])
    lda_model = LdaMulticore.load(config['model_path'])
    log.info("Loading dictionary from %s", config['dict_path'])
    dictionary = Dictionary.load(config['dict_path'])

    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=preprocessed_docs,
                                         dictionary=dictionary,
                                         coherence='c_v')

    coherence_lda = coherence_model_lda.get_coherence()

    # Append one "<topics>;<coherence>" row.  newline='' leaves line endings
    # to the csv writer, which avoids blank rows on platforms that translate
    # '\n' on write.
    with open(config['coherence_path'], "a", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=';')
        writer.writerow([config['topics'], coherence_lda])

Exemple #16

0

Afficher le fichier

def _load_or_compute_topic_proba(cache_path, data, lda_model, label):
    """Return topic probabilities for ``data``, using ``cache_path`` as a pickle cache.

    :param cache_path: file holding (or to receive) the pickled result
    :param data: input handed to ``get_topic_proba`` when the cache is unusable
    :param lda_model: model handed to ``get_topic_proba`` when the cache is unusable
    :param label: "Train" or "Test"; only used in the progress message
    :return: the cached or freshly computed result
    """
    try:
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)
    except Exception:
        # Best effort: a missing or unreadable cache just means recomputing.
        print("Prepare %s data" % label)

    topic_proba = get_topic_proba(data, lda_model)
    with open(cache_path, "wb") as cache_file:
        pickle.dump(topic_proba, cache_file)
    return topic_proba


def LDA_model_out_of_time_tpot(df,
                               features,
                               target,
                               dest_all_model,
                               downsample=False):
    """Cross-validate a TPOT classifier on LDA topic features over time-based folds.

    For every fold produced by ``TimeBasedCV`` the fold's LDA model is loaded,
    the train and test rows are converted with ``get_topic_proba`` (results
    are cached as pickles under ``main_dir + 'data/topic_predictions/'``), a
    ``TPOTClassifier`` is fitted, and accuracy, F1 and the confusion matrix
    are accumulated.  The summed confusion matrix is plotted at the end.

    :param df: data frame holding the feature columns, the target column and
        ``sec_filing_date``
    :param features: column labels used as inputs; ``sec_filing_date`` must be
        among them (it is dropped before the topic conversion)
    :param target: label of the target column, which is cast to bool
    :param dest_all_model: mapping from the 1-based fold number as a string to
        a pair of model file names; only the first element is used here
    :param downsample: when true, the ``_downsample`` cache files are used
    :return: ``(scores, clf, cf_matrix_val)``: the per-fold accuracy and F1
        lists, the classifier fitted on the last fold (``None`` when there
        were no folds) and the summed confusion matrix
    """
    X = df[features]
    y = df[target].astype('bool')

    scores = {'acc': [], 'f1': []}
    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy, so the builtin is used directly.
    cf_matrix_val = np.zeros((2, 2), dtype=int)
    # Stays None if the splitter yields no folds.
    clf = None

    tbcv = TimeBasedCV(train_period=3, test_period=1, freq='years')
    tbcv_folds = tbcv.split(df,
                            validation_split_date=datetime.date(2008, 12, 31),
                            date_column='sec_filing_date')
    k_folds = len(tbcv_folds)
    # The downsampled variant only differs in the cache file names.
    cache_variant = '_downsample' if downsample else ''
    for k_index, (train_index, test_index) in enumerate(tbcv_folds):
        fold_number = k_index + 1

        # The second element of the pair is not needed in this function.
        dest_train, _dest_val = dest_all_model[str(fold_number)]

        data_train = X.loc[train_index].drop('sec_filing_date', axis=1)
        target_train = y.loc[train_index]

        data_test = X.loc[test_index].drop('sec_filing_date', axis=1)
        target_test = y.loc[test_index]

        print("=========================================")
        print("==== K Fold Validation step => %d/%d ======" %
              (fold_number, k_folds))
        print("=========================================")

        lda_model_train = LdaMulticore.load(lda_data_dir + dest_train)

        cache_stem = (main_dir + 'data/topic_predictions/' +
                      str(fold_number) + cache_variant)
        data_train = _load_or_compute_topic_proba(
            cache_stem + '_data_train.list', data_train, lda_model_train,
            "Train")
        data_test = _load_or_compute_topic_proba(
            cache_stem + '_data_test.list', data_test, lda_model_train,
            "Test")

        clf = TPOTClassifier(generations=5,
                             population_size=50,
                             verbosity=2,
                             max_time_mins=5)
        clf.fit(data_train, target_train.values.ravel())
        preds = clf.predict(data_test)

        # accuracy for the current fold only
        score = clf.score(data_test, target_test)

        f1 = f1_score(target_test, preds)

        cf_matrix_val += confusion_matrix(target_test, preds)
        scores['acc'].append(score)
        scores['f1'].append(f1)

    # Avoid a ZeroDivisionError when no fold was evaluated.
    if scores['acc']:
        mean_accuracy = sum(scores['acc']) / len(scores['acc'])
    else:
        mean_accuracy = float('nan')
    print("Cross Validation Score: " + str(mean_accuracy))

    #Visualize confusion matrix for cross-val data
    labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    categories = ['No Downgrade', 'Downgrade']
    make_confusion_matrix(cf_matrix_val,
                          group_names=labels,
                          categories=categories,
                          cbar=False,
                          title='Confusion Matrix: TPOT',
                          figsize=(10, 10))

    return scores, clf, cf_matrix_val

Exemple #17

0

Afficher le fichier

 def load_model(self, model_path, dict_path):
     """Load the LDA model and the dictionary and store both on the instance.

     The model path is logged before loading starts.
     """
     message = "Loading model from {}".format(model_path)
     logger.info(message)
     loaded_model = LdaMulticore.load(model_path)
     self.model = loaded_model
     loaded_dictionary = Dictionary.load(dict_path)
     self.dictionary = loaded_dictionary

Exemple #18

0

Afficher le fichier

 def pretrained(self,lda,dicti):
     """Load a saved dictionary and LDA model and return them as ``(dictionary, model)``.

     ``dicti`` is the dictionary file and ``lda`` the model file; the
     dictionary is loaded first.
     """
     return corpora.Dictionary.load(dicti), LdaMulticore.load(lda)

Exemple #19

0

Afficher le fichier

Fichier : trainer.py Projet : Xnkr/cg-chatbot

def get_lda_model(mode=QUESTION):
    """Load the LDA model whose file name is registered for ``mode`` under ``MODEL_DIR``."""
    model_filename = modes[mode].model
    model_location = os.path.join(MODEL_DIR, model_filename)
    return LdaMulticore.load(model_location)

Exemple #20

0

Afficher le fichier

Fichier : data_analysis.py Projet : Argetlame-sc/sentiment_analysis_reddit

    res_reduced = results[indices]
    plot_sentiment_time(res_reduced,
                        time_filtered,
                        title_suffix=", for fluff flair",
                        bins=60)

    time_filtered, indices = filter_comment_flair(comments_list, "created_utc",
                                                  "IMAGE")
    res_reduced = results[indices]
    plot_sentiment_time(res_reduced,
                        time_filtered,
                        title_suffix=", for image flair",
                        bins=60)

    if args.topic_model:
        topic_model = LdaMulticore.load(args.topic_model)

        if not args.load_preprocess:
            comments_text = load_all_comments(db_name=args.comments_cached,
                                              only="body")

            comments_text_filtered, dictionary, comments_tokenized = preprocess_text(
                comments_text)
            print("filtered text")

            comments_corpus = text2corpus(comments_text_filtered, dictionary)
            del comments_text_filtered
            with open("temp_corpus.pickle", "wb") as f:
                pickle.dump((comments_corpus, dictionary), f)
        else:
            with open("temp_corpus.pickle", "rb") as f:

Exemple #21

0

Afficher le fichier

            tokens_or_none = [
                tok.lower_ for tok in doc if not tok.is_stop and tok.is_alpha
            ]
        else:
            tokens_or_none = None
    else:
        tokens_or_none = None
    return (tokens_or_none, component)


docs = defaultdict(list)
models = defaultdict()

print("Loading models...")
# One saved topic model per component, stored under the component's name.
for component in components:
    model_file = models_path + component + '.model.topic'
    models[component] = LdaMulticore.load(model_file)

# Compute cross-corpus topic similarities
diffs = defaultdict(defaultdict)

for (train, test) in itertools.product(*(components, components)):
    print(f'Computing topic diff for {train} on {test}...')
    diff, annotation = models[train].diff(models[test])
    diffs[train][test] = diff.tolist()

    fig, ax = plt.subplots(1, 1)

    img = plt.imshow(diff, cmap='gray')

    ax.set_title("Topic Model Difference Matrix")
    ax.set_ylabel(train)

Exemple #22

0

Afficher le fichier

Fichier : lda.py Projet : haviansyah/pro-ok

 def getModel(self):
     """Load the dictionary and LDA model named in the current state.

     :return: ``(dictionary, model)``
     """
     # Read the state once so both file names come from the same snapshot.
     state = get_current_state()
     dictionary_file = state["dictionary"]
     lda_file = state["lda"]
     dictionary = corpora.Dictionary.load("models/" + dictionary_file)
     model = LdaMulticore.load("models/" + lda_file)
     return dictionary, model

Exemple #23

0

Afficher le fichier

Fichier : model_validation.py Projet : AmeshS17/SDGP-Project

import string
import gensim
from gensim.models import LdaMulticore
from gensim import corpora
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

def _strip_punctuation_and_split(text):
    """Drop every punctuation character except '_' from ``text`` and split on whitespace."""
    kept = ''.join(c for c in text if c == '_' or c not in string.punctuation)
    return kept.split()


# read the cleaned data frame
model_data_frame = pd.read_csv('./dataframes/model2_df.csv', index_col=0)

# load the trained model
lda_model = LdaMulticore.load('./models/nouns_only/model/model.model')

# Turn the required text columns into token lists; the '3grams_nouns' lists
# are converted to bag-of-words vectors (doc2bow) further down
for column in ('clean_reviews', '3grams_nouns'):
    model_data_frame[column] = model_data_frame[column].map(
        _strip_punctuation_and_split)

documents = list(model_data_frame['3grams_nouns'])
dictionary = gensim.corpora.Dictionary(documents)

# reload the dictionary that was created during model creation; it already
# had filter_extremes() applied during the training step
saved_dictionary_path = './models/nouns_only/model/model.model.id2word'
dictionary_saved = gensim.corpora.Dictionary.load(saved_dictionary_path)
corpus = [dictionary_saved.doc2bow(text) for text in documents]

# print the keywords associated with each topic
topic_list = lda_model.print_topics(num_topics=5, num_words=15)

Exemple #24

0

Afficher le fichier

# Convert every review's words to a bag-of-words vector and store the corpus
# on disk in Blei format.
corpus = [dictionary.doc2bow(review["words"]) for review in corpus_list]
BleiCorpus.serialize(corpus_path, corpus, id2word=dictionary)

# Train on the serialized corpus and save the model.
# Single-argument print() calls are used throughout so the script produces
# the same output on Python 2 and Python 3.
corpus = corpora.BleiCorpus(corpus_path)
print("running lda")
lda = gensim.models.LdaMulticore(corpus,
                                 num_topics=lda_num_topics,
                                 id2word=dictionary,
                                 minimum_probability=0.,
                                 workers=8)
lda.save(lda_model_path)
print("done lda")

# Reload dictionary, corpus and model from disk before inspecting the topics.
dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaMulticore.load(lda_model_path)
for i, topic in enumerate(lda.show_topics(lda_num_topics)):
    print(type(topic))
    print('Topic #' + str(i) + ': ' + str(topic))

# Collect the first element of each of the top terms of every topic.
nwords_per_topic = 10
topic_words_set = set()
for i in range(lda_num_topics):
    topic_words_set.update(
        term[0] for term in lda.get_topic_terms(i, topn=nwords_per_topic))
# print 'Topic_Words = ', topic_words_set
# The doubled space reproduces the separator a two-item print statement emits.
print('Topic_Words_len =  ' + str(len(topic_words_set)))
# Give every collected entry a consecutive index.
topic_word_id = {word: index for index, word in enumerate(topic_words_set)}

Exemple #25

0

Afficher le fichier

Fichier : generate_summary.py Projet : AmeshS17/SDGP-Project

def summarize(csv_content):
    """Summarise review sentiment per dominant LDA topic.

    Reads up to 1500 reviews, assigns each its highest-scoring topic from the
    saved model, scores every review with ``SentimentIntensityAnalyzer`` and
    reports, per topic label, the share of positive (> 0.1), neutral and
    negative (< -0.1) compound scores.

    :param csv_content: input accepted by ``pandas.read_csv``; must provide
        the ``review`` and ``3gram_reviews`` columns
    :return: dict mapping a topic label to ``[positive, neutral, negative]``
        shares rounded to 3 decimals; labels without reviews are omitted
    """
    cleaned_data = pd.read_csv(csv_content, index_col=0, nrows=1500)
    # Drop every punctuation character except '_' and split on whitespace.
    cleaned_data['3gram_reviews'] = cleaned_data['3gram_reviews'].map(
        lambda x: ''.join(c for c in x
                          if c == '_' or c not in string.punctuation).split())
    lda_model = LdaMulticore.load('model.model')
    documents = list(cleaned_data['3gram_reviews'])
    # Only the dictionary saved next to the model is used for doc2bow.
    dictionary_saved = gensim.corpora.Dictionary.load('model.model.id2word')
    corpus = [dictionary_saved.doc2bow(text) for text in documents]

    topicDictionary = {
        '0': 'Performance',
        '1': 'Overall_Experience',
        '2': 'Gameplay_Mechanics',
        '3': 'Content_Value',
        '4': 'NO_TOPIC',
    }

    def identifyReviewTopics():
        """Return one row per document: dominant topic, its weight, its keywords, original text."""
        rows = []
        for row in lda_model[corpus]:
            if not row:
                # No topic reported for this document: nothing to record.
                continue
            # The first maximum is kept, matching a stable descending sort.
            topicNumber, prop_topic = max(row, key=lambda pair: pair[1])
            topicKeywords = ", ".join(
                word for word, prop in lda_model.show_topic(topicNumber))
            rows.append([
                topicDictionary[str(topicNumber)],
                round(prop_topic, 4), topicKeywords
            ])
        # Build the frame in one go: DataFrame.append no longer exists in
        # pandas 2.x and copied the whole frame on every call.
        reviewTopicsDataframe = pd.DataFrame(
            rows,
            columns=['Dominant_Topic', 'Contribution_Percentage', 'Keywords'])
        # Add original text to the end of the output
        # NOTE(review): concat aligns on index labels; this assumes the CSV
        # index matches the 0..n-1 positions used above -- confirm.
        originalDataframe = pd.DataFrame(
            cleaned_data[['review', '3gram_reviews']])
        return pd.concat([reviewTopicsDataframe, originalDataframe], axis=1)

    reviewTopicsDf = identifyReviewTopics()
    sentAnalyzer = SentimentIntensityAnalyzer()
    reviewTopicsDf['compound_sentiment'] = reviewTopicsDf['review'].map(
        lambda x: sentAnalyzer.polarity_scores(x)['compound'])
    sentimentDictionary = {}

    for topic in topicDictionary.values():
        isCurrentTopic = reviewTopicsDf['Dominant_Topic'] == topic
        sentimentList = reviewTopicsDf[isCurrentTopic]['compound_sentiment']
        totalList = len(sentimentList)
        print(topic)
        print(totalList)
        if totalList > 0:
            positiveCount = sum(1 for x in sentimentList if x > 0.1)
            negativeCount = sum(1 for x in sentimentList if x < -0.1)
            # Everything that is neither positive nor negative is neutral.
            neutralCount = totalList - positiveCount - negativeCount
            # output percentages as positive, neutral, then negative
            sentimentDictionary[topic] = [
                round(positiveCount / totalList, 3),
                round(neutralCount / totalList, 3),
                round(negativeCount / totalList, 3)
            ]
    print("\nBy review\n", sentimentDictionary)

    return sentimentDictionary

Exemple #26

0

Afficher le fichier

 def __init__(self):
     """Load the dictionary and the LDA model from the paths defined outside this method."""
     loaded_dictionary = corpora.Dictionary.load(dictionary_path)
     self.dictionary = loaded_dictionary
     loaded_lda = LdaMulticore.load(lda_model_path)
     self.lda = loaded_lda

Exemple #27

0

Afficher le fichier

    def load_from_file(self):
        """Return the LDA model read from the file whose name is built from ``self.id``."""
        from gensim.models import LdaMulticore

        saved_model_name = "lda_out_%d.model" % self.id
        loaded_model = LdaMulticore.load(saved_model_name)
        return loaded_model

Exemple #28

0

Afficher le fichier

 def infer(self, dataset):
     """Load the saved LDA model, prepare ``dataset`` and return ``(corpus, model)``."""
     lda_model = LdaMulticore.load('../models.nosync/lda/model')
     # _prepare returns a pair; its second element is not needed here.
     prepared_corpus, _ = self._prepare(dataset)
     return prepared_corpus, lda_model

Exemple #29

0

Afficher le fichier

def load_lda_model_multicores(filepath):
    """Load and return the ``LdaMulticore`` model saved at ``filepath``."""
    loaded_model = LdaMulticore.load(filepath)
    return loaded_model

Exemple #30

0

Afficher le fichier

def model(n_topics,
          alpha=None,
          beta=None,
          saved=False,
          pyldavis=False,
          wordclouds=False,
          rep_letters=False,
          plots=False) -> dict:
    """Load or train an LDA model with ``n_topics`` topics and evaluate it.

    Uses the ``corpus``, ``dictionary`` and ``letters`` names defined outside
    this function and rebinds the globals ``pwt``, ``vw`` and ``vws``.

    :param n_topics: number of topics; must be at least 2
    :param alpha: passed to ``LdaMulticore`` as ``alpha`` when training;
        ``None`` is replaced by ``"symmetric"``
    :param beta: passed to ``LdaMulticore`` as ``eta`` when training
    :param saved: load the model from ``f"{TRAINED_LDA}{n_topics}"`` instead
        of training and saving a new one
    :param pyldavis: also write a pyLDAvis HTML page under ``PYLDAVIS_PATH``
    :param wordclouds: also call ``save_topic_wordclouds``
    :param rep_letters: also call ``save_representative_letters``
    :param plots: also call the per-year and per-recipient plot helpers
    :return: dict with the keys ``model``, ``num_topics``, ``alpha``,
        ``beta``, ``coherence``, ``silhouette``, ``vws`` and ``pwt``
    """
    assert n_topics >= 2

    # aux functions to make sure it's loading the desired model
    def verify_alpha(lda_model, given):
        actual: list = lda_model.alpha
        if given == "asymmetric":
            return not np.isclose(actual[0], actual[-1])
        elif given == "symmetric":
            return np.isclose(actual[0], actual[-1])
        else:
            return np.isclose(given, actual[0]) and np.isclose(
                given, actual[-1])

    def verify_beta(lda_model, given):
        actual = lda_model.eta
        # isinstance also accepts float subclasses such as numpy.float64.
        if isinstance(given, float):
            # np.isclose because exact == comparison is unreliable for floats
            return np.isclose(given, actual[0]) and np.isclose(
                given, actual[-1])
        else:
            return False

    print(f"Building LDA model for {n_topics} topics.")

    if saved:
        lda = LdaMulticore.load(f"{TRAINED_LDA}{n_topics}")

        # NOTE(review): the loaded model is currently not checked with
        # verify_alpha / verify_beta, so it may have been trained with
        # different priors than the ones passed in.

        print(f"Trained LDA model with {n_topics} topics loaded successfully.")

    else:
        lda = LdaMulticore(
            corpus,
            num_topics=n_topics,
            id2word=dictionary,
            passes=20,
            alpha=alpha if alpha is not None else "symmetric",  # default
            eta=beta,
            random_state=1,
            iterations=100,
            eval_every=5,
            workers=3,
            per_word_topics=True)

        lda.save(f"{TRAINED_LDA}{n_topics}")
        print(
            f"LDA model with {n_topics} topics trained and saved successfully."
        )

    # save per-word-topics 3D matrix
    # [!] alters global variable
    # NOTE(review): pwt is only zero-initialised in this function; nothing
    # here fills it before it is passed on or returned -- confirm.
    V = len(dictionary)
    K = n_topics
    N = len(corpus)
    global pwt
    pwt = np.zeros((V, K, N))

    # save topic assignment info in dataframes
    # [!] alters global variables
    global vw
    global vws
    vws = get_topic_dists_dataframe(lda)
    vw, vws = set_main_topics(vw, vws)

    # coherence and silhouette scores
    coherence = CoherenceModel(model=lda,
                               texts=letters,
                               dictionary=dictionary,
                               coherence='c_v').get_coherence()
    print(f"Coherence score: {coherence}")  # the higher the better

    avg_silhouette = plot_silhouette(vws)
    print(f"Average silhouette coefficient: {avg_silhouette}"
          )  # the higher the better

    # other validation methods
    if pyldavis:
        vis = pyLDAvis.gensim.prepare(topic_model=lda,
                                      corpus=corpus,
                                      dictionary=dictionary,
                                      n_jobs=3)
        pyLDAvis.save_html(vis, f"{PYLDAVIS_PATH}/lda{n_topics}.html")

    if rep_letters:
        save_representative_letters(vws, 3)

    if wordclouds:
        save_topic_wordclouds(pwt)

    if plots:
        plot_topics_per_year(vw)
        plot_topics_per_recipient(vw)

    return {
        "model": lda,
        "num_topics": n_topics,
        "alpha": alpha,
        "beta": beta,
        "coherence": coherence,
        "silhouette": avg_silhouette,
        "vws": vws,
        "pwt": pwt
    }

Exemple #31

0

Afficher le fichier

Fichier : view_topics.py Projet : rohit-ganapathy/Topic-Modelling-LDA

from gensim.models import LdaMulticore as LDA
import os
import argparse

# Command-line options select which saved model to open.
parser = argparse.ArgumentParser(description='View generated topics')
for flag, help_text in (('--n_topics', 'Number of Topics'),
                        ('--gram', 'unigram or both')):
    parser.add_argument(flag, help=help_text)
args = parser.parse_args()

# Models are stored as "<cwd>/LDA models/<gram>/<n_topics>-topics".
model_location = os.getcwd() + "/LDA models/{}/{}-topics".format(
    args.gram, args.n_topics)
model = LDA.load(model_location)

# Print every topic reported by the model under a "Topic-<id>" heading.
for (topic_id, topic_terms) in model.show_topics():
    print("Topic-{} \n".format(topic_id))
    print(topic_terms)
    print("\n")