Example #1
def generate_topic_words():  # Output each model's top topic terms to CSV files.
    pbar = tqdm.tqdm(total=len(models_path))
    i = 0
    for model_path in models_path:
        df = pd.DataFrame()
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)
        topics_dictionary = lda_model.show_topics(num_topics=num_topics[i],
                                                  num_words=30,
                                                  formatted=False)

        for topic in topics_dictionary:
            topic_num = topic[0] + 1
            terms_string = ''
            for term in topic[1]:
                terms_string += term[0] + ', '
            df = df.append(pd.Series([topic_num, terms_string[:-2]]),
                           ignore_index=True)

        output_path = f'./turn-in/{bigram_threshold}/topic_terms/{num_topics[i]}.csv'
        create_file(output_path)
        df.columns = ['Topic', 'Terms']
        df.sort_values(by=['Topic'], ascending=True,
                       inplace=True)  # Sort rows by topic number
        df.to_csv(output_path, index=False)
        pbar.update(1)
        i = i + 1
    pbar.close()
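This function (like several later examples from the same project) reads module-level globals — models_path, num_topics, iteration, bigram_threshold — and a create_file helper defined elsewhere. A minimal setup sketch, with illustrative values only:

import os
import tqdm
import pandas as pd
from gensim import models
from gensim.models.wrappers import LdaMallet  # gensim 3.x; the wrapper was removed in gensim 4

num_topics = [10, 20, 30]                                  # one entry per saved model (illustrative)
models_path = [f'./models/mallet_{k}.model' for k in num_topics]
iteration = 1000                                           # iterations passed to malletmodel2ldamodel
bigram_threshold = 100                                     # only used to build the output path

def create_file(path):
    # Ensure the parent directory exists so to_csv() can write the file.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)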
Example #2
def explore_temporal_trends(topics, gender):
    model = LdaMallet.load('ldamodel.' + gender + '.' + str(topics))
    topic_words = model.show_topics(num_topics=topics, num_words=100, formatted=False)
    words_only = [(tp[0], [wd[0] for wd in tp[1]]) for tp in topic_words]

    words = list()
    for tp in words_only:
        words.extend(tp[1])

    weekly_stats = dict()
    data = Serialization.load_obj('week2comments.' + gender)
    for week in sorted(data):  # iterate weeks in order; the dict may have gaps
        print('processing data for week', week)
        week_data = ' '.join(data[week]).lower()
        tokens = week_data.split()
        total = len(tokens)

        for word in set(words):
            current_word_stats = weekly_stats.get(word, list())
            current_word_stats.append(float(tokens.count(word)) / total)
            weekly_stats[word] = current_word_stats
        # end for
    # end for

    for word in weekly_stats:
        y = np.array(weekly_stats[word])
        x = np.arange(len(y)).reshape(-1, 1)
        model = LinearRegression().fit(x, y)
        print(word, model.coef_[0])
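A hypothetical invocation, assuming a MALLET model saved as 'ldamodel.female.20' and a serialized dict 'week2comments.female' mapping week numbers to lists of comment strings; the coefficient printed per word is its weekly frequency trend:

explore_temporal_trends(topics=20, gender='female')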
Example #3
def chatbot_thread(bot_name, lda_path, dict_path, dom_path):

    # Get the stop words
    stop_words = stopwords.words('english')

    # Load pre-trained modules
    lda = LdaMallet.load(lda_path)
    dictionary = Dictionary.load(dict_path)
    dominant_topics = pd.read_csv(dom_path)

    # Announce the bot's entrance
    post_message(bot_name, "Hello there!", True)

    # Loop forever until program termination
    while True:
        user_message = message_queue.get()  # blocks until a message arrives

        prob_still_chat = min(random(), INITIAL_CHATTINESS)

        # Possibly post several responses to a single query, with decreasing probability
        responding = True
        while responding:
            response = get_response(user_message, lda, dictionary,
                                    dominant_topics, stop_words)
            post_message(bot_name, response, True)
            if prob_still_chat < CHATTINESS:
                responding = False
            else:
                prob_still_chat *= prob_still_chat
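Since the function blocks forever, it is presumably run on its own thread; a hypothetical launch (message_queue, post_message, and the chattiness constants come from the surrounding module, and the file paths are illustrative):

import threading

bot = threading.Thread(target=chatbot_thread,
                       args=('TopicBot', 'lda.model', 'lda.dict', 'dominant_topics.csv'),
                       daemon=True)
bot.start()
message_queue.put('tell me about graph minors')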
Example #4
def generate_topic_proportion_terms():
    # The proportion is based on the dominant topic assigned to each document.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    pbar = tqdm.tqdm(total=len(models_path))
    i = 0
    for model_path in models_path:
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)

        df_dominant_topic_document = dominant_topics(lda_model=lda_model,
                                                     corpus=corpus)
        # Number of Documents for Each Topic
        topic_counts = df_dominant_topic_document[
            'Dominant_Topic'].value_counts()

        # Percentage of Documents for Each Topic
        topic_contribution = round(topic_counts / topic_counts.sum(), 4)

        # Topic Nums
        topic_nums = pd.Series(topic_contribution.index,
                               topic_contribution.index)

        # Topic Terms
        topic_terms = pd.Series(dtype=object)
        topics_dictionary = lda_model.show_topics(num_topics=num_topics[i],
                                                  num_words=30,
                                                  formatted=False)
        for topic in topics_dictionary:
            topic_num = topic[0] + 1
            terms_string = ''
            for term in topic[1]:
                terms_string += term[0] + ', '
            topic_terms = topic_terms.append(
                pd.Series(terms_string[:-2], index=[topic_num * 1.0]))

        # Concatenate Column wise
        df_dominant_topics = pd.concat(
            [topic_nums, topic_counts, topic_contribution, topic_terms],
            axis=1)

        # Change Column names
        df_dominant_topics.columns = [
            'Topic', 'Count_Documents', 'Proportion_Over_Documents', 'Terms'
        ]
        df_dominant_topics.sort_values(
            by=['Topic'], ascending=True,
            inplace=True)  # Sort rows by topic number

        output_path = f'./turn-in/{bigram_threshold}/topic_proportion_terms/{num_topics[i]}.csv'
        create_file(output_path)
        df_dominant_topics.to_csv(output_path, index=False)
        pbar.update(1)
        i = i + 1
    pbar.close()
Example #5
def _load_model(model_type, fname='../../model/'):
    try:
        if model_type == 'lsi':
            return LsiModel.load(fname)
        elif model_type == 'lda':
            return LdaModel.load(fname)
        elif model_type == 'mallet':
            return LdaMallet.load(fname)
    except Exception:
        return None
Example #6
def train(self, train_filename):
    print("train QuoteRec")
    train_name = os.path.basename(train_filename)
    model_filename = train_name + ".quoterec_model"
    if os.path.isfile(model_filename):
        self.model = LdaMallet.load(model_filename)
    else:
        self.corpus = preprocessing.GensimCorpus(train_filename)
        self.model = LdaMallet(mallet_path,
                               self.corpus,
                               num_topics=100,
                               id2word=self.corpus.dictionary)
        self.model.save(model_filename)
        topics_str = self.model.show_topics(num_topics=-1)
        with open(train_name + ".lda_model.topics", 'w') as f:
            f.write(str(topics_str))
Example #7
def load_lda_model(lda_fname):
    """
    Loads an LDA model that could either be a Gensim trained LdaModel or a
    MALLET wrapper (an instance of LdaMallet).
    """
    
    if lda_fname.endswith('.lda.pickle'):
        return LdaModel.load(lda_fname)
    elif lda_fname.endswith('.ldamallet.pickle'):
        return LdaMallet.load(lda_fname)
    else:
        raise ValueError('filename {} does not end with either .lda.pickle '
                         'or .ldamallet.pickle'.format(repr(lda_fname)))
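Hypothetical calls showing how the filename-suffix convention dispatches to the right loader:

model = load_lda_model('acl.ldamallet.pickle')  # routed to LdaMallet.load
model = load_lda_model('acl.lda.pickle')        # routed to LdaModel.load
load_lda_model('acl.model')                     # raises ValueError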
Example #8
def train(self, train_filename):
    print("train LDA")
    train_name = os.path.basename(train_filename)
    model_filename = train_name + ".lda_model"
    if os.path.isfile(model_filename):
        self.model = LdaMallet.load(model_filename)
    else:
        self.corpus = preprocessing.GensimCorpus(train_filename)
        self.model = LdaMallet(mallet_path,
                               self.corpus,
                               num_topics=100,
                               id2word=self.corpus.dictionary)
        self.model.save(model_filename)
        topics_str = self.model.show_topics(num_topics=-1)
        with open(train_name + ".lda_model.topics", 'w') as f:
            f.write(str(topics_str))
Example #9
def _load_model(model_type, fname):
    logger.info(f'Loading {model_type} model {fname}..')
    try:
        if model_type == 'lsi':
            return LsiModel.load(f'../model/lsi_model/{fname}')
        elif model_type == 'lda':
            return LdaModel.load(f'../model/lda_model/{fname}')
        elif model_type == 'mallet':
            return LdaMallet.load(f'../model/mallet_model/{fname}')
        elif model_type == 'hdp':
            return HdpModel.load(f'../model/hdp_model/{fname}')
    except Exception as ex:
        logger.warning(f'{model_type} model {fname} could not be loaded.',
                       exc_info=ex)
        return None
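A hypothetical call; the file is looked up under ../model/mallet_model/ and None is returned (with a warning logged) if loading fails:

model = _load_model('mallet', 'acl_50.model')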
Example #10
def get_model(self, lang: str, data_version: int, dictionary_version: float,
              model_version: str, param_name: str, param_version: int,
              language_processed_data: list, model_view: bool):
    logging.info("--- Getting LDA Mallet model")
    if self.model is None:
        model_file_path = Advisor.get_model_type_folders_file_path(
            lang, data_version, dictionary_version, model_version,
            param_name, param_version, self.model_type, "MLDA-model")
        if os.path.exists(model_file_path):
            self.model = LdaMallet.load(model_file_path)
        else:
            logging.info("---- LDA Mallet model was not created before; creating it")
            self.set_model(lang, data_version, dictionary_version, model_version,
                           param_name, param_version, model_file_path,
                           language_processed_data)
    logging.info("--- LDA Mallet model captured")
    if model_view:
        self.visualization.get_model_visualizations(self.model_type, self.model,
                                                    self.essentials.corpus,
                                                    language_processed_data)
    return self.model
Example #11
def topic_model_coherence_generator(texts, start_topic_count, end_topic_count,
                                    step):
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count + 1,
                                 step)):
        dictionary_path = 'dictionary_mallet122_' + str(
            topic_nums) + '.dictionary'
        dictionary = corpora.Dictionary.load(dictionary_path)
        corpus = [dictionary.doc2bow(text) for text in texts]
        model_path = 'C:\\Users\\asus\\Desktop\\hypertension相关评价\\python生成结果\\LDA\\122文章\\mallet模型\\dictionary_mallet122_' + str(
            topic_nums) + '.model'
        mallet_lda_model = LdaMallet.load(model_path)
        cv_coherence_model_mallet_lda = CoherenceModel(model=mallet_lda_model,
                                                       corpus=corpus,
                                                       texts=texts,
                                                       dictionary=dictionary,
                                                       coherence='c_v')
        coherence_score = cv_coherence_model_mallet_lda.get_coherence()
        coherence_scores.append(coherence_score)
    return coherence_scores
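A hypothetical sweep over models pre-trained for 5, 10, ..., 30 topics; texts must be the same tokenized documents used at training time:

scores = topic_model_coherence_generator(texts, start_topic_count=5, end_topic_count=30, step=5)
for k, score in zip(range(5, 31, 5), scores):
    print(k, 'topics -> c_v =', round(score, 4))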
Example #12
def generate_topic_weight_terms():
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    pbar = tqdm.tqdm(total=len(models_path))
    i = 0
    for model_path in models_path:
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)
        df = topics_proportion(lda_model=lda_model,
                               corpus=corpus,
                               num_topics=num_topics[i])
        df.sort_values(by=['Topic'], ascending=True,
                       inplace=True)  # Sort rows by topic number

        output_path = f'./turn-in/{bigram_threshold}/topic_proportion_terms/{num_topics[i]}.csv'
        create_file(output_path)
        df.to_csv(output_path, index=False)
        pbar.update(1)
        i = i + 1
    pbar.close()
Example #13
def calculate_entropy_mallet_models():  # Output per-document topic entropy to CSV files.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    index = 0
    dataset = pd.read_csv(dataset_csv_path)
    for model_path in models_path:
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)

        df = pd.DataFrame()
        pbar = tqdm.tqdm(total=len(lda_model[corpus]))

        for i, row in enumerate(lda_model[corpus]):
            topic_dist = sorted(row, key=lambda x: x[1], reverse=True)
            rs_string = ''
            topic_entropy = 0
            for topic in topic_dist:
                rs_string += f'Topic {topic[0] + 1}: {topic[1]}; '
                # Shannon entropy term: -p * log2(p)
                topic_entropy += -topic[1] * math.log2(topic[1])
            df = df.append(pd.Series([
                str(i), dataset['Submission_Num'][i], rs_string,
                str(topic_entropy), dataset['Submission_Text'][i]
            ]),
                           ignore_index=True)
            pbar.update(1)
        df.columns = [
            'Document_No', 'Submission_Num', 'Probabilities', 'Entropy',
            'Submission_Text'
        ]

        csv_file_result_path = f'./turn-in/{bigram_threshold}/model_entropy/{num_topics[index]}.csv'
        index = index + 1
        create_file(csv_file_result_path)
        df.to_csv(csv_file_result_path, index=False)
        pbar.close()
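As a sanity check on the entropy column: the Shannon entropy of a topic distribution is H = -Σ p·log2(p), so a document spread uniformly over four topics should score 2.0:

import math
dist = [0.25, 0.25, 0.25, 0.25]
print(sum(-p * math.log2(p) for p in dist))  # 2.0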
Example #14
import gensim
from gensim.models.wrappers import LdaMallet
# If mallet doesn't work, use normal LDA.
from gensim.models.ldamodel import LdaModel
ldamallet = LdaMallet.load(
    '/home/ashwath/Programs/MAGCS/LDA/ldamallet_mag50.model')
lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
    ldamallet, gamma_threshold=0.001, iterations=50)
lda.save('lda_mag50.model')
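After the conversion the model behaves like a native gensim LdaModel, so new documents can be scored without shelling out to the MALLET binary; a sketch with a hypothetical dictionary path:

from gensim import corpora

dictionary = corpora.Dictionary.load('/home/ashwath/Programs/MAGCS/LDA/mag50.dict')  # hypothetical path
bow = dictionary.doc2bow(['graph', 'minors', 'survey'])
print(lda[bow])  # [(topic_id, probability), ...]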
Example #15
    vocab_index, phis, thetas = [None, None, None]
    print('Building phi and theta matrices')
    if tool == 'gensim':
        lda_model = LdaModel.load(args.ldamodel_fn)
        vocab_index = {term: i for i, term in lda_model.id2word.items()}
        thetas = get_gensim_thetas(args.in_tsv, vocab_index, lda_model)
        phis = lda_model.get_topics()
        phis = phis / phis.sum(axis=1, keepdims=True)
    elif tool == 'mallet':
        vocab, vocab_index = get_vocab(args.vocab_fn, True)
        thetas = get_mallet_thetas(args.doc_topics_fn)
        n_topics = thetas.shape[1]
        phis = get_mallet_phis(args.word_weights_fn, vocab_index, n_topics)
    elif tool == 'mallet-gensim':
        lda_model = LdaMallet.load(args.ldamodel_fn)
        vocab_index = {term: i for i, term in lda_model.id2word.items()}
        thetas = get_gensim_mallet_thetas(args.in_tsv, vocab_index, lda_model)
        phis = lda_model.get_topics()
        phis = phis / phis.sum(axis=1, keepdims=True)
    print('Building topic-author-term counts')
    (topic_term_counts,
     topic_author_term_counts) = estimate_topic_counts(args.in_tsv,
                                                       vocab_index,
                                                       author_index,
                                                       thetas,
                                                       phis,
                                                       verbose=args.verbose)
    print('Evaluating topic-author correlation')
    evaluate_correlation(topic_term_counts, topic_author_term_counts,
                         args.out_tsv)
Example #16
def load_model(self, path_to_model):
    """Load the LDA model."""
    lda_model = LdaMallet.load(path_to_model)
    return lda_model
Example #17
hybrid_weights.extend(hybrid_weights)
hybrid_weights = np.array(hybrid_weights)
# Convert to probabilities
hybrid_weights = hybrid_weights / hybrid_weights.sum()

# GLOBAL num_items_to_pick (with replacement) -- high number: one million
num_picks = 1000000

# LOAD MODELS
loadmodstart = time()
id2word_dictionary = corpora.Dictionary.load(
    '/home/ashwath/Programs/ACLAAn/LDA/aclmag.dict')
corpus = corpora.MmCorpus(
    '/home/ashwath/Programs/ACLAAn/LDA/aclmag_bow_corpus.mm')
try:
    ldamallet = LdaMallet.load(
        '/home/ashwath/Programs/ACLAAn/LDA/lda_model.model')
    vec_bow_test = id2word_dictionary.doc2bow(['test'])
    vec_ldamallet = ldamallet[vec_bow_test]
except subprocess.CalledProcessError:
    print("LDA MALLET COULDN'T READ INSTANCE FILE. USING NORMAL LDA INSTEAD")
    ldamallet = LdaModel.load(
        '/home/ashwath/Programs/ACLAAn/LDA/ldanormal_acl.model')

#index = similarities.MatrixSimilarity(ldamallet[corpus])
#index.save("simIndex.index")
malletindex = similarities.MatrixSimilarity.load(
    '/home/ashwath/Programs/ACLAAn/LDA/simIndexAcl.index')
with open(
        '/home/ashwath/Programs/ACLAAn/LDA/docid_to_magid_training_acl.pickle',
        'rb') as pick:
    docid_to_magid = pickle.load(pick)
Example #18
DICT_PATH  = 'docs.dict'
MODEL_PATH = 'docs.model'

raw_corpus = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
docs = [doc.split() for doc in raw_corpus]

if exists(MODEL_PATH):
    print('Testing...\n')
    dictionary = corpora.Dictionary.load(DICT_PATH)
    lda = LdaMallet.load(MODEL_PATH)
    for doc in docs:
        topics = lda[dictionary.doc2bow(doc)]
        print(topics, doc)
else:
    print('Training...\n')
    dictionary = corpora.Dictionary(docs)
    dictionary.save(DICT_PATH)
    corpus = [dictionary.doc2bow(text) for text in docs]

    lda = LdaMallet(MALLET_PATH, corpus=corpus,
                    num_topics=3, workers=60, id2word=dictionary, iterations=50, prefix=PREFIX)
    lda.save(MODEL_PATH)
Example #19
def load_model(self, model_name):
    self.model_name = model_name
    self.dictionary = corpora.Dictionary.load("ldamodels/" + self.model_name + "/dict.dict")
    self.model = LdaMallet.load("ldamodels/" + self.model_name + "/model.model")
    self.model.mallet_path = self.mallet_path
Example #20
def load_lda_mallet_model(filepath):
    return LdaMallet.load(filepath)
Example #21
def load_mallet_model(artefacts_path='./artefacts', suffix=''):
    model_path = str(Path(artefacts_path) / 'model')
    if suffix:
        model_path = model_path + f'_{suffix}'
    model = LdaMallet.load(model_path)
    return model
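Hypothetical calls illustrating the suffix convention:

model = load_mallet_model()                # loads ./artefacts/model
model_v2 = load_mallet_model(suffix='v2')  # loads ./artefacts/model_v2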
Example #22
def load_gensim_file(file_name):
    return LdaMallet.load('data/gensim_models/' + file_name)
Example #23
from HyperDoc2Vec import *

snowball = SnowballStemmer(language='english')
nlp = spacy.load('en', disable=['parser', 'ner'])
nlp.Defaults.stop_words |= {'table', 'ref', 'formula', 'citation', 'cit', 'references',
                            'fig', 'figure', 'abstract', 'introduction',
                            'description', 'conclusion', 'results', 'discussion'}
mallet_path = '/home/ashwath/mallet-2.0.8/bin/mallet'


# LOAD MODELS
loadmodstart = time()
id2word_dictionary = corpora.Dictionary.load('/home/ashwath/Programs/ArxivCS/LDA/arxivmag.dict')
corpus = corpora.MmCorpus('/home/ashwath/Programs/ArxivCS/LDA/arxivmag_bow_corpus.mm')
try:
    ldamallet = LdaMallet.load('/home/ashwath/Programs/ArxivCS/LDA/ldamallet_arxiv.model')
    vec_bow_test = id2word_dictionary.doc2bow(['test'])
    vec_ldamallet = ldamallet[vec_bow_test]
except subprocess.CalledProcessError:
    print("LDA MALLET COULDN'T READ INSTANCE FILE. USING NORMAL LDA INSTEAD")
    ldamallet = LdaModel.load('/home/ashwath/Programs/ArxivCS/LDA/lda_arxiv.model')
    
malletindex = similarities.MatrixSimilarity.load('/home/ashwath/Programs/ArxivCS/LDA/simIndexArxiv.index')
with open('/home/ashwath/Programs/ArxivCS/LDA/docid_to_magid_training_arxiv.pickle', 'rb') as pick:
    docid_to_magid = pickle.load(pick)

hd2vmodel = HyperDoc2Vec.load('/home/ashwath/Programs/ArxivCS/hyperdoc2vec_arxivmag/models/hd2v_arxivmag.model')
print("MODELS took {} seconds to load".format(time()-loadmodstart))

def remove_stopwords(context):
    #print("Removing stop words.")
                                     id2word=dictionary,
                                     prefix=path_to_mallet_output,
                                     workers=multiprocessing.cpu_count())
        mallet_lda_model.save('{}lda_model.pkl'.format(path_to_mallet_output))
        # mallet_lda_model.save('{}lda_model_{}.pkl'.format(path_to_mallet_output, uuid))
        print('calculate model coherence C_v score')
        coherence_model_lda = CoherenceModel(model=mallet_lda_model,
                                             texts=docs,
                                             dictionary=dictionary,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('model coherence score: {}'.format(coherence_lda))
    else:
        print('load mallet LDA model')
        # mallet_lda_model = LdaMallet.load('{}lda_model.pkl'.format(path_to_mallet_output))
        mallet_lda_model = LdaMallet.load('{}lda_model_{}.pkl'.format(
            path_to_mallet_output, uuid))

    # # #convert the model to gensim format
    # # lda_model = malletmodel2ldamodel(mallet_lda_model)
    # # lda_model.save('{}gensim_lda_model.pkl'.format(path_to_mallet_output))

    # # topics = np.array(list(zip(*lda_model.show_topics(lda_model.num_topics, num_words=20, formatted=False)))[1])
    # # topics = topics[:, :, 0]
    # # topics = pd.DataFrame(topics)
    # # topics.to_csv('{}topics_with_20_words.csv'.format(path_to_mallet_output))

    # #determine which words will be included in each topic -> keyword string mapping
    # print('Determining word weight per topic')
    # topic_ids = mallet_lda_model.word_topics.argsort()[:, ::-1]
    # topic_weights = mallet_lda_model.word_topics
    # topic_weights.sort()