コード例 #1
0
ファイル: _0.py プロジェクト: haha405pan/applied-NLP-smm694
def compute_coherence_values(_dictionary, _corpus, _texts, _limit, _start,
                             _step, _path, _seed):
    """
    Compute c_v coherence for various number of topics

    One Mallet LDA model is trained for every topic count in
    ``range(_start, _limit, _step)`` and scored with a c_v coherence model.

    Parameters:
    -----------
    _dictionary : Gensim dictionary
    _corpus     : Gensim corpus
    _texts      : List of input texts
    _limit      : Max number of topics (exclusive upper bound of the range)
    _start      : First number of topics to try
    _step       : Increment between consecutive numbers of topics
    _path       : Path handed to the wrapper as ``mallet_path``
    _seed       : Value handed to the wrapper as ``random_seed``

    Returns:
    --------
    model_list       : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model
                       with respective number of topics
    """
    # containers
    coherence_values = []
    model_list = []
    for num_topics in range(_start, _limit, _step):
        # The Mallet wrapper class is LdaMallet; `wrappers.Lda` does not
        # exist and would raise AttributeError on the first iteration.
        model = gensim.models.wrappers.LdaMallet(mallet_path=_path,
                                                 corpus=_corpus,
                                                 num_topics=num_topics,
                                                 id2word=_dictionary,
                                                 random_seed=_seed)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=_texts,
                                        dictionary=_dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    # return output
    return model_list, coherence_values
コード例 #2
0
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every topic count in
    ``range(start, limit, step)`` and scored with a c_v coherence model.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First num of topics to try
    step : Increment between consecutive nums of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        # Score against the caller-supplied texts rather than a module-level
        # global, so the function honours its own `texts` parameter.
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
コード例 #3
0
def compute_coherence_s(num_topic):
    """Train an LDA model with ``num_topic`` topics and return its c_v coherence.

    The model is fitted on the module-level ``corpus_s`` / ``dict_corp_s`` and
    scored against the ``stylist_processed_msg`` column of ``order_lag_one``.
    """
    print(f"Computing for num topic {num_topic}")

    topic_model = LdaModel(
        corpus_s, num_topics=num_topic, id2word=dict_corp_s, passes=15)
    stylist_texts = order_lag_one['stylist_processed_msg'].tolist()

    scorer = CoherenceModel(model=topic_model,
                            texts=stylist_texts,
                            dictionary=dict_corp_s,
                            coherence='c_v')
    return scorer.get_coherence()
コード例 #4
0
    def search_lda(self, data_train, data_val, min_topics=5, max_topics=20,
                   mallet_path="../../mallet-2.0.8/bin/mallet"):
        """Train one Mallet LDA model per topic count and keep the most coherent.

        The ``"opinion"`` column of both inputs is passed through
        ``self.clean``. The dictionary (stored on ``self.id2word``) is built
        from the cleaned training documents, and every model is scored with
        c_v coherence on the cleaned validation documents. The model with the
        highest coherence is stored on ``self.lda``.

        Parameters
        ----------
        data_train : indexable by ``"opinion"``; also passed to ``self.get_corpus``
        data_val : indexable by ``"opinion"``; used for coherence scoring
        min_topics : smallest number of topics to try (inclusive)
        max_topics : largest number of topics to try (inclusive)
        mallet_path : path to the Mallet executable; defaults to the
            location that was previously hard-coded

        Returns
        -------
        lda_search : list of trained models, one per topic count
        coherences : list of c_v coherence values, aligned with ``lda_search``

        Raises
        ------
        ValueError
            If ``min_topics`` is greater than ``max_topics``.
        """
        # An empty range would only fail at np.argmax below, after all the
        # preprocessing; reject it up front with a clear message instead.
        if min_topics > max_topics:
            raise ValueError(
                "min_topics ({}) must not exceed max_topics ({})".format(
                    min_topics, max_topics))

        doc_train = self.clean(data_train["opinion"])
        doc_val = self.clean(data_val["opinion"])
        self.id2word = corpora.Dictionary(doc_train)
        train_corpus = self.get_corpus(data_train)
        lda_search = []
        coherences = []
        for t in range(min_topics, max_topics + 1):
            lda = gensim.models.wrappers.LdaMallet(mallet_path,
                                                   corpus=train_corpus,
                                                   id2word=self.id2word,
                                                   num_topics=t,
                                                   random_seed=utils.RANDOM_SEED)
            lda_search.append(lda)
            coherence_model = CoherenceModel(lda, texts=doc_val, dictionary=self.id2word, coherence='c_v')
            coherences.append(coherence_model.get_coherence())
        # Keep the model whose coherence is highest.
        self.lda = lda_search[np.argmax(coherences)]

        return lda_search, coherences
コード例 #5
0
def compute_coherence_c(num_topic):
    """Train an LDA model with ``num_topic`` topics and return its c_v coherence.

    The model is fitted on the module-level ``corpus`` / ``dict_corp_c`` with a
    fixed random state, and scored against the ``customer_processed_msg``
    column of ``order_lag_one``.
    """
    print(f"Computing for num topic {num_topic}")

    topic_model = LdaModel(corpus,
                           num_topics=num_topic,
                           id2word=dict_corp_c,
                           passes=15,
                           random_state=60616)
    customer_texts = order_lag_one['customer_processed_msg'].tolist()

    scorer = CoherenceModel(model=topic_model,
                            texts=customer_texts,
                            dictionary=dict_corp_c,
                            coherence='c_v')
    return scorer.get_coherence()
コード例 #6
0
import sys
import numpy as np

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel, CoherenceModel
from gensim.models import word2vec

# Command-line arguments: corpus file path and the LDA alpha prior.
data_file = sys.argv[1]
alpha = float(sys.argv[2])

# Keep only sentences with at least two tokens.
sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]

dic = Dictionary(sentences)

corpus = [dic.doc2bow(s) for s in sentences]

# Sweep the number of topics from 1 to 30 and report two quality measures.
for i in range(1, 31):
    # Use the alpha supplied on the command line; it was parsed above but a
    # hard-coded 0.01 was passed here, silently ignoring the argument.
    lda = LdaModel(corpus=corpus, id2word=dic, num_topics=i, alpha=alpha, random_state=1)

    cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()

    # Perplexity is derived from the per-word bound as 2 ** (-bound).
    perwordbound = lda.log_perplexity(corpus)
    perplexity = np.exp2(-perwordbound)

    print(f"num_topics = {i}, coherence = {coherence}, perplexity = {perplexity}")
コード例 #7
0
# LDA alpha prior, read from the third command-line argument.
alpha = float(sys.argv[3])

# Load the corpus, dropping sentences shorter than two tokens.
sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]
dic = Dictionary(sentences)
corpus = [dic.doc2bow(s) for s in sentences]

# Coherence measures reported for every model, in CSV column order.
coh_list = ['u_mass', 'c_v', 'c_w2v', 'c_uci', 'c_npmi']


def _coherence_as_text(model, measure):
    """Return the ``measure`` coherence of ``model`` as a string."""
    scorer = CoherenceModel(model=model,
                            corpus=corpus,
                            texts=sentences,
                            coherence=measure,
                            processes=1)
    return str(scorer.get_coherence())


# CSV header row.
print(','.join(['topic_num'] + coh_list))

# One CSV row per topic count: the count followed by each coherence value.
for i in range(1, max_topic_num + 1):
    lda = LdaModel(
        corpus=corpus, id2word=dic, num_topics=i, alpha=alpha, random_state=1)

    coh = [_coherence_as_text(lda, c) for c in coh_list]

    print(','.join([str(i)] + coh))
コード例 #8
0
# Also use TF-IDF topic modelling in future

# Fit a 32-topic multicore LDA model on `corpus_tfidf`.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=32,
    id2word=dictionary,
    passes=2,
    workers=2,
)

# Print the index and word description of each topic that is returned.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

# Sample document test score (disabled):
# for index, score in sorted(lda_model_tfidf[bow_corpus[0]], key=lambda tup: -1*tup[1]):
#     print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 30)))

from gensim.models.ldamodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

# c_v coherence of the fitted model, measured against `processed_docs`.
coherence_model_lda = CoherenceModel(
    model=lda_model_tfidf,
    texts=processed_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
コード例 #9
0
# Load the corpus, dropping sentences shorter than two tokens.
sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]
dic = Dictionary(sentences)
corpus = [dic.doc2bow(s) for s in sentences]

# CSV header row.
print('topic_num,avg,bound,perplexity,coherence')

# One CSV row of model statistics per topic count.
for i in range(1, max_topic_num + 1):
    lda = LdaModel(
        corpus=corpus, id2word=dic, num_topics=i, alpha=alpha, random_state=1)

    # Average number of topics the model returns per document.
    avg_topics = mean([len(lda[c]) for c in corpus])

    bound = lda.bound(corpus)

    # Perplexity is derived from the per-word bound as 2 ** (-bound).
    perwordbound = lda.log_perplexity(corpus)
    perplexity = np.exp2(-perwordbound)

    cm = CoherenceModel(
        model=lda, corpus=corpus, coherence='u_mass', processes=1)
    coherence = cm.get_coherence()

    print(f"{i},{avg_topics},{bound},{perplexity},{coherence}")