def coherence_umass():
    """Score every saved LDA model with UMass coherence and export the results.

    The topic counts come from ``range(min, max + 1, step)``; ``min``,
    ``max`` and ``step`` are resolved outside this function (NOTE: the first
    two shadow the builtins), as are ``corpus`` and ``dictionary``.  For each
    topic count the model saved under ``./Topic_Modeling/Models/`` is loaded
    and scored twice: with the top 10 and with the top 20 words per topic.

    Both score series are written to CSV (``;`` separator, ``,`` decimal
    mark), indexed by topic count.  Nothing is returned.
    """
    def umass_score(lda, top_words):
        # One u_mass evaluation of `lda` on the shared corpus/dictionary.
        evaluator = models.CoherenceModel(corpus=corpus,
                                          model=lda,
                                          dictionary=dictionary,
                                          coherence='u_mass',
                                          topn=top_words)
        return evaluator.get_coherence()

    topic_counts = []
    scores_10 = []
    scores_20 = []
    for topic_count in range(min, max + 1, step):
        topic_counts.append(topic_count)
        lda = models.LdaModel.load(
            "./Topic_Modeling/Models/Topic_Model_%i" % topic_count)
        for top_words, scores in ((10, scores_10), (20, scores_20)):
            scores.append(umass_score(lda, top_words))

    # Build both tables first, then write them out.
    frame_10 = pandas.DataFrame(data=scores_10, index=topic_counts)
    frame_20 = pandas.DataFrame(data=scores_20, index=topic_counts)
    exports = (
        (frame_10, "./Topic_Modeling/Evaluation/UMass_Score_10_words.csv"),
        (frame_20, "./Topic_Modeling/Evaluation/UMass_Score_20_words.csv"),
    )
    for frame, target in exports:
        frame.to_csv(target, sep=';', decimal=',')
Ejemplo n.º 2
0
def build_coherence_models(topic_model, **kwargs):
    """Evaluate ``topic_model`` with four coherence measures.

    Keyword arguments read from ``kwargs``:
      * ``corpus`` and ``dictionary`` -- used by every measure;
      * ``texts`` -- used by ``c_v``, ``c_uci`` and ``c_npmi``.
    A missing key raises ``KeyError``.

    Returns a dict with the keys ``num_topics``, ``u_mass``, ``c_v``,
    ``c_uci`` and ``c_npmi``.
    """
    def text_based(measure):
        # Measures other than u_mass additionally receive the texts.
        return models.CoherenceModel(model=topic_model,
                                     texts=kwargs['texts'],
                                     corpus=kwargs['corpus'],
                                     dictionary=kwargs['dictionary'],
                                     coherence=measure)

    # All evaluators are constructed before any score is computed.
    evaluators = {
        'u_mass': models.CoherenceModel(model=topic_model,
                                        corpus=kwargs['corpus'],
                                        dictionary=kwargs['dictionary'],
                                        coherence='u_mass'),
    }
    for measure in ('c_v', 'c_uci', 'c_npmi'):
        evaluators[measure] = text_based(measure)

    summary = {'num_topics': topic_model.num_topics}
    for measure, evaluator in evaluators.items():
        summary[measure] = evaluator.get_coherence()
    return summary
Ejemplo n.º 3
0
    def train(self, num_topics=None):
        """Train the LDA model, optionally searching for the best topic count.

        Parameters
        ----------
        num_topics : int, optional
            When given, ``self._train`` is called once with this value and no
            search takes place.  When ``None``, topic counts 1, 2, 3, ... are
            tried in turn, each candidate is scored with ``c_v`` coherence
            and a deep copy of the best-scoring model ends up in
            ``self.ldamodel``.

        Notes
        -----
        ``self._train`` is expected to leave the freshly trained model in
        ``self.ldamodel``; that attribute is read right after each call.

        The search stops at the first candidate that does not improve on the
        best coherence seen so far and that either trails it by more than 0.2
        or has reached 20 topics.  A candidate that still improves at 20 or
        more topics keeps the search going.

        If no candidate ever scores above the initial baseline of 0 (for
        example when the coherence is NaN), ``self.ldamodel`` is left as the
        last trained model instead of failing with ``KeyError``.
        """
        if num_topics is not None:
            self._train(num_topics)
            return

        best = {'num_topic': 0, 'coherence': 0}
        for num in itertools.count(1):
            self._train(num)
            cm = models.CoherenceModel(
                model=self.ldamodel,
                texts=self.collection,
                dictionary=self.dictionary,
                coherence='c_v'
            )
            coherence = cm.get_coherence()
            if coherence > best['coherence']:
                # Deep copy: the next _train call replaces self.ldamodel.
                best = {
                    'lda': copy.deepcopy(self.ldamodel),
                    'num_topic': num,
                    'coherence': coherence
                }
            elif (best['coherence'] - coherence) > 0.2 or num >= 20:
                break

        # 'lda' is only present once some candidate beat the baseline.
        if 'lda' in best:
            self.ldamodel = best['lda']
Ejemplo n.º 4
0
    def score(self, X, y=None):
        """Return a goodness-of-fit score of the wrapped model on `X`.

        Which score is computed depends on ``self.scorer``:

        * ``'perplexity'`` -- the negated perplexity, so that a higher value
          is better;
        * ``'u_mass'`` -- the UMass coherence computed on `X` with the top 3
          words per topic.

        Parameters
        ----------
        X : iterable of list of (int, number)
            Sequence of documents in BOW format.
        y : ignored
            Not used by this method.

        Returns
        -------
        float
            The score computed with the selected method.

        Raises
        ------
        ValueError
            If ``self.scorer`` is neither ``'perplexity'`` nor ``'u_mass'``.

        """
        scorer = self.scorer
        if scorer not in ('perplexity', 'u_mass'):
            raise ValueError(
                "Invalid value {} supplied for `scorer` param".format(
                    self.scorer))

        if scorer == 'u_mass':
            evaluator = models.CoherenceModel(model=self.gensim_model,
                                              corpus=X,
                                              coherence=self.scorer,
                                              topn=3)
            return evaluator.get_coherence()

        # Perplexity: normalise the variational bound by the word count.
        total_words = sum(count for doc in X for _, count in doc)
        ratio = 1.0
        bound = self.gensim_model.bound(X, subsample_ratio=ratio)
        per_word_bound = bound / (ratio * total_words)
        # Negated so that model selection can maximise the score.
        return -1 * np.exp2(-per_word_bound)
Ejemplo n.º 5
0
def compute_coherence_values(dictionary, tfidf_corpus, corpus, start, stop,
                             step):
    """Train LDA models over a range of topic counts and score each with c_v.

    Parameters
    ----------
    dictionary : Gensim dictionary
        Used as ``id2word`` for training and as the coherence dictionary.
    tfidf_corpus : Gensim corpus
        Corpus every LDA model is trained on.
    corpus : Gensim corpus
        Not used by this function; kept so existing callers keep working.
    start, stop, step : int
        Topic counts are ``range(start, stop, step)`` (``stop`` exclusive).

    Returns
    -------
    list
        One c_v coherence value per topic count, in range order.  The
        trained models themselves are not returned.

    Notes
    -----
    The tokenised texts for the coherence computation are read from the name
    ``formatted``, which must be defined at module level; it is not a
    parameter of this function.
    """
    coherence_values = []
    for number_of_topics in range(start, stop, step):
        # Train one model per candidate topic count.  The model is not kept
        # after scoring, so memory use does not grow with the range.
        lda = models.LdaModel(tfidf_corpus,
                              num_topics=number_of_topics,
                              id2word=dictionary)
        coherencemodel = models.CoherenceModel(model=lda,
                                               texts=formatted,
                                               dictionary=dictionary,
                                               coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return coherence_values
Ejemplo n.º 6
0
    def train(self, num_topics=None):
        """Train one LDA model per collection and store them in ``self.ldamodels``.

        Parameters
        ----------
        num_topics : sequence of int, optional
            When given, ``num_topics[i]`` is the topic count used for the
            i-th entry of ``self.collections`` and no search takes place.
            When ``None``, topic counts 1, 2, 3, ... are tried for every
            collection and a deep copy of the model with the highest ``c_v``
            coherence is kept.

        Notes
        -----
        Each collection is a mapping with at least the keys ``'collection'``
        (the texts) and ``'dictionary'``.

        The search for a collection stops at the first candidate that does
        not improve on the best coherence so far and that either trails it by
        more than 0.2 or has reached 20 topics.

        If no candidate scores above the initial baseline of 0 (for example
        when the coherence is NaN), the last trained model is stored for that
        collection instead of failing with ``KeyError``.
        """
        if num_topics is not None:
            self.ldamodels = [
                self._train(num_topics[key], collection)
                for key, collection in enumerate(self.collections)
            ]
            return

        self.ldamodels = []
        for collection in self.collections:
            best = {'num_topic': 0, 'coherence': 0}
            for num in itertools.count(1):
                ldamodel = self._train(num, collection)
                cm = models.CoherenceModel(
                    model=ldamodel,
                    texts=collection['collection'],
                    dictionary=collection['dictionary'],
                    coherence='c_v')
                coherence = cm.get_coherence()
                if coherence > best['coherence']:
                    best = {
                        'lda': copy.deepcopy(ldamodel),
                        'num_topic': num,
                        'coherence': coherence
                    }
                elif (best['coherence'] - coherence) > 0.2 or num >= 20:
                    break

            # 'lda' is missing when nothing beat the baseline; fall back to
            # the most recently trained model for this collection.
            self.ldamodels.append(best.get('lda', ldamodel))
def coherence_cv(texts_file):
    """Score every saved LDA model with c_v coherence and export the results.

    ``texts_file`` is a UTF-8 CSV file (``;`` delimiter, ``|`` quote char);
    each row is used as one tokenised text.  The topic counts come from
    ``range(min, max + 1, step)``; ``min``, ``max``, ``step`` and
    ``dictionary`` are resolved outside this function.  Each saved model is
    scored with the top 10 and the top 20 words per topic, using a single
    process.

    Both score series are written to CSV (``;`` separator, ``,`` decimal
    mark), indexed by topic count.  Nothing is returned.
    """
    import csv

    # The texts are tokenised texts, one per CSV row.
    with open(texts_file, newline='', encoding="utf-8") as csvfile:
        texts = list(csv.reader(csvfile, delimiter=';', quotechar='|'))

    topic_counts = []
    scores_10 = []
    scores_20 = []
    for topic_count in range(min, max + 1, step):
        topic_counts.append(topic_count)
        lda = models.LdaModel.load(
            "./Topic_Modeling/Models/Topic_Model_%i" % topic_count)
        for top_words, scores in ((10, scores_10), (20, scores_20)):
            evaluator = models.CoherenceModel(texts=texts,
                                              model=lda,
                                              dictionary=dictionary,
                                              coherence='c_v',
                                              topn=top_words,
                                              processes=1)
            scores.append(evaluator.get_coherence())

    # Build both tables first, then write them out.
    frame_10 = pandas.DataFrame(data=scores_10, index=topic_counts)
    frame_20 = pandas.DataFrame(data=scores_20, index=topic_counts)
    exports = (
        (frame_10, "./Topic_Modeling/Evaluation/Cv_Score_10_words.csv"),
        (frame_20, "./Topic_Modeling/Evaluation/Cv_Score_20_words.csv"),
    )
    for frame, target in exports:
        frame.to_csv(target, sep=';', decimal=',')
def coherence_umass(model):
    """Return the UMass coherence of ``model`` on the bigram input data.

    The corpus (Matrix Market file) and the dictionary (text format) are
    loaded from ``./Topic_Modeling_Bigram/Input_Data/`` on every call.
    """
    bigram_corpus = corpora.MmCorpus(
        "./Topic_Modeling_Bigram/Input_Data/corpus.mm")
    bigram_dictionary = corpora.dictionary.Dictionary.load_from_text(
        "./Topic_Modeling_Bigram/Input_Data/dictionary.dict")
    evaluator = models.CoherenceModel(corpus=bigram_corpus,
                                      model=model,
                                      dictionary=bigram_dictionary,
                                      coherence='u_mass',
                                      processes=-1)
    return evaluator.get_coherence()
 def score(self, docs, model=None, coherence="c_v", return_per_topic=False):
     """Compute a coherence score for a topic model (higher is better).

     Parameters
     ----------
     docs : tokenised texts handed to the coherence model as ``texts``.
     model : topic model to evaluate; ``self.model`` is used when ``None``.
     coherence : name of the coherence measure, ``"c_v"`` by default.
     return_per_topic : when true, the per-topic coherences are returned too.

     Returns
     -------
     The overall coherence, or the tuple ``(overall, per_topic)`` when
     ``return_per_topic`` is true.
     """
     # Identity check: `== None` would call the model's own __eq__.
     if model is None:
         model = self.model
     cm = models.CoherenceModel(
         model=model,
         texts=docs,
         dictionary=self.dictionary,
         coherence=coherence,
     )
     if return_per_topic:
         return cm.get_coherence(), cm.get_coherence_per_topic()
     return cm.get_coherence()
Ejemplo n.º 10
0
 def score(self, X, y=None):
     """
     Return a score describing how well the model fits the input data.

     With ``self.scorer == 'perplexity'`` the negated perplexity of `X` is
     returned; with ``'u_mass'`` the UMass coherence on `X` (top 3 words per
     topic). Any other value raises ``ValueError``. `y` is not used.
     """
     scorer = self.scorer
     if scorer not in ('perplexity', 'u_mass'):
         raise ValueError("Invalid value of `scorer` param supplied")

     if scorer == 'u_mass':
         evaluator = models.CoherenceModel(model=self.gensim_model, corpus=X, coherence=self.scorer, topn=3)
         return evaluator.get_coherence()

     # Perplexity: normalise the variational bound by the word count.
     total_words = sum(count for doc in X for _, count in doc)
     ratio = 1.0
     bound = self.gensim_model.bound(X, subsample_ratio=ratio)
     per_word_bound = bound / (ratio * total_words)
     # Negated so that selecting the maximum picks the minimum perplexity.
     return -1 * np.exp2(-per_word_bound)
Ejemplo n.º 11
0
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, start,
                             stop, step):
    """Train LSI models over a range of topic counts and score each with c_v.

    One ``LsiModel`` is trained on ``doc_term_matrix`` for every topic count
    in ``range(start, stop, step)`` and scored against the texts in
    ``doc_clean``.

    Returns the tuple ``(models, coherences)``; both lists follow the order
    of the topic-count range.
    """
    trained = []
    scores = []
    for topic_count in range(start, stop, step):
        lsi = models.LsiModel(doc_term_matrix,
                              num_topics=topic_count,
                              id2word=dictionary)
        trained.append(lsi)
        evaluator = models.CoherenceModel(model=lsi,
                                          texts=doc_clean,
                                          dictionary=dictionary,
                                          coherence='c_v')
        scores.append(evaluator.get_coherence())
    return trained, scores
def cv_score(corpus, dict_, k, alpha, eta):
    """Train an ``LdaMulticore`` model and return its c_v coherence.

    ``k`` is the number of topics; ``alpha`` and ``eta`` are passed through
    to the model.  Training uses a fixed ``random_state`` of 100, chunks of
    100 documents and 10 passes.

    The tokenised texts for the coherence computation are read from the
    name ``texts``, which is resolved outside this function.
    """
    training_options = dict(
        corpus=corpus,
        id2word=dict_,
        num_topics=k,
        alpha=alpha,
        eta=eta,
        random_state=100,
        chunksize=100,
        passes=10,
        per_word_topics=True,
    )
    lda_model = models.LdaMulticore(**training_options)

    # Other available measures: u_mass, c_uci, c_npmi.
    evaluator = models.CoherenceModel(model=lda_model,
                                      texts=texts,
                                      corpus=corpus,
                                      dictionary=dict_,
                                      coherence="c_v")
    return evaluator.get_coherence()
Ejemplo n.º 13
0
def lda_model_selection(corpus, id2word, r):
    """Train one LDA model per topic count in ``r`` and plot c_v coherence.

    Parameters
    ----------
    corpus : corpus the models are trained on.
    id2word : id-to-word mapping handed to ``LdaModel``.
    r : iterable of topic counts; it is also used as the x axis of the plot,
        so it must be re-iterable (e.g. a ``range`` or a list).

    The coherence computation reads the names ``Text`` (tokenised texts) and
    ``dictionary``, which must be defined at module level.  Progress and
    scores are printed; the resulting curve is shown with matplotlib.
    Nothing is returned.
    """
    print('Selecting LDA models...')
    coherence_values = []
    for num_topics in r:
        print('Number of topics: %d' % num_topics)
        model = models.LdaModel(corpus, num_topics=num_topics, id2word=id2word,
                                alpha='auto', eta='auto', minimum_probability=0.001, passes=10)
        coherence_model = models.CoherenceModel(model=model, texts=Text, dictionary=dictionary, coherence='c_v')
        v = coherence_model.get_coherence()
        coherence_values.append(v)
        print('Coherence value: %f' % v)

    plt.plot(r, coherence_values)
    plt.xlabel('Num Topics')
    plt.ylabel('Coherence score')
    # Labels must be a sequence: a bare string is consumed one character per
    # plotted line, which labelled the curve just "c".
    plt.legend(['coherence_values'], loc='best')
    plt.show()
Ejemplo n.º 14
0
def get_optimal_ldamodel_by_coherence_values(corpus,
                                             texts,
                                             dictionary,
                                             stop=100,
                                             start=10,
                                             step=10):
    """
    Return the LDA model whose topic count gives the highest c_v coherence.

    Input   : corpus : Gensim corpus the models are trained on
              texts : List of input texts used for the coherence score
              dictionary : Gensim dictionary
              stop : Upper bound (exclusive) of the topic counts
              start : First topic count
              step : Increment between topic counts
    purpose : Train one LDA model per topic count in range(start, stop, step),
              score each with c_v and print the scores and the best count
    Output  : The trained model with the highest coherence (the first one
              on ties).  An empty range raises ValueError from np.argmax.
    """
    candidate_counts = range(start, stop, step)
    trained = []
    scores = []
    for topic_count in candidate_counts:
        lda = models.LdaModel(corpus=corpus,
                              num_topics=topic_count,
                              id2word=dictionary,
                              alpha='auto',
                              eta='auto',
                              eval_every=None)
        trained.append(lda)
        evaluator = models.CoherenceModel(model=lda,
                                          texts=texts,
                                          dictionary=dictionary,
                                          coherence='c_v')
        scores.append(evaluator.get_coherence())

    print("num_topics: %s" % str(candidate_counts))
    print("coherence_values: %s" % str(scores))

    best_index = np.argmax(np.array(scores))
    print("opt_num_topics: %s" % candidate_counts[best_index])
    return trained[best_index]
def main():
    """Fit a 20-topic LDA model on the review files and print diagnostics.

    Every file found by ``searchFiles('./Reviews/IP/')`` is read as CSV and
    the frames are concatenated.  The '내용' (content) column is repeated
    ``int(log2(2 + '공감수'))`` times per row, which weights a review by its
    '공감수' (sympathy count) column.  The vectorised documents are used to
    train the model; the sorted topics, the topic assignment of the third
    document, the log-perplexity and the c_v coherence are printed.
    """
    frames = [
        pd.read_csv(path, encoding='utf-8')
        for path in searchFiles('./Reviews/IP/')
    ]
    docs = pd.concat(frames, ignore_index=True)

    def weighted_content(row):
        # Repeat the content once per unit of the log-scaled count.
        return row['내용'] * int(np.log2(2 + row['공감수']))

    docs['내용'] = docs.apply(weighted_content, axis=1)
    print('리뷰 읽기 끝')

    vect = GensimTfidfVectorizer(tokenizer=getNVM_lemma, n_gram=2, dir_path='.')

    bow_corpus = vect.fit_transform(docs['내용'])
    id2word = vect.get_id2word()
    tokenized = vect.texts

    print('벡터화 끝')
    lda = models.LdaModel(corpus=bow_corpus,
                          id2word=id2word,
                          num_topics=20,
                          update_every=1,
                          chunksize=1000,
                          passes=10,
                          alpha='auto',
                          eta='auto',
                          per_word_topics=False)

    raw_topics = lda.show_topics(num_topics=20, num_words=20, formatted=False)
    topics = sorted(raw_topics, key=lambda topic: topic[0])

    pprint(topics)
    print('')

    # Topic distribution of the document at index 2.
    for row in lda[bow_corpus][2]:
        pprint(row)

    print(lda.log_perplexity(bow_corpus))
    evaluator = models.CoherenceModel(model=lda, texts=tokenized, dictionary=id2word, coherence='c_v')
    print(evaluator.get_coherence())
    def LDA(self,
            tf_vector,
            K_range,
            PASS_range,
            ITER_range,
            alpha="auto",
            eta="auto",
            seed=7571):
        """Grid-search LDA hyper-parameters and keep the best model by c_v.

        One model is trained on ``tf_vector`` for every combination of topic
        count (``K_range``), passes (``PASS_range``) and iterations
        (``ITER_range``); NumPy's global random seed is reset to ``seed``
        before each training run.

        Results are stored on the instance:
          * ``LDA_MODEL_LIST`` -- every trained model, in training order;
          * ``LDA_EVAL_LIST`` -- ``(K, passes, iterations, coherence)`` tuples;
          * ``LDA_index`` / ``LDA_param`` -- position and tuple of the best
            entry so far;
          * ``BEST_MODEL`` -- the model belonging to that entry.
        The best-so-far attributes are refreshed, and printed, after every
        single training run.  Nothing is returned.
        """
        self.LDA_EVAL_LIST = []
        self.LDA_MODEL_LIST = []
        for topic_count in K_range:
            for pass_count in PASS_range:
                for iteration_count in ITER_range:
                    print("建模參數測試-主題數:{} PASS:{} Iter:{}".format(
                        topic_count, pass_count, iteration_count))
                    np.random.seed(seed)
                    candidate = models.LdaModel(corpus=tf_vector,
                                                id2word=self.dic,
                                                alpha=alpha,
                                                eta=eta,
                                                num_topics=topic_count,
                                                passes=pass_count,
                                                iterations=iteration_count)
                    self.LDA_MODEL_LIST.append(candidate)
                    evaluator = models.CoherenceModel(model=candidate,
                                                      texts=self.split_list,
                                                      dictionary=self.dic,
                                                      coherence='c_v')
                    coherence = evaluator.get_coherence()
                    self.LDA_EVAL_LIST.append(
                        (topic_count, pass_count, iteration_count, coherence))

                    # Model comparison: pick the entry with the highest
                    # coherence (field 3) among everything trained so far.
                    best_entry = max(enumerate(self.LDA_EVAL_LIST),
                                     key=lambda entry: entry[1][3])
                    self.LDA_index, self.LDA_param = best_entry
                    self.BEST_MODEL = self.LDA_MODEL_LIST[self.LDA_index]
                    print("最佳參數-主題數:{} PASS:{} Iter:{} Eval:{}".format(
                        *self.LDA_param))
Ejemplo n.º 17
0
def evaluate_graph(filename, dictionary, corpus, texts, limit):
    """Train LDA models for 1..limit-1 topics and plot their c_v coherence.

    Parameters
    ----------
    filename : prefix of the output file; the plot is saved as
        ``filename + "_coherence-topic.pdf"``.
    dictionary : Gensim dictionary (``id2word`` and coherence dictionary).
    corpus : Gensim corpus the models are trained on.
    texts : tokenised texts used for the coherence computation.
    limit : exclusive upper bound of the topic counts tried.

    Returns
    -------
    (lda_models, coherence_values) : two dicts keyed by topic count, holding
        the trained model and its c_v coherence respectively.
    """
    coherence_values = {}
    lda_models = {}
    for num_topics in range(1, limit):
        lm = models.LdaModel(corpus=corpus,
                             num_topics=num_topics,
                             id2word=dictionary)
        lda_models[num_topics] = lm
        cm = models.CoherenceModel(model=lm,
                                   texts=texts,
                                   dictionary=dictionary,
                                   coherence='c_v')
        coherence_values[num_topics] = cm.get_coherence()

    x = list(coherence_values.keys())
    y = list(coherence_values.values())
    # Label the line itself: ("c_v") is just the string "c_v", which legend()
    # would consume one character per line and show as "c".
    plt.plot(x, y, label="c_v")
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(loc='best')
    # Save before show(): once the window shown by show() is closed, the
    # figure is gone and savefig() would write an empty canvas.
    plt.savefig(filename + "_coherence-topic.pdf", bbox_inches='tight')
    plt.show()
    return lda_models, coherence_values
def main():
    """Build an LSI model over the input documents and print its topics.

    NOTE: this function uses Python 2 ``print`` statements.

    Steps: read the documents, preprocess them in a worker pool, build a
    dictionary and a TF-IDF weighted bag-of-words corpus, fit an LSI model,
    print the per-topic UMass coherence, pick the number of topics from the
    singular values and, for each topic, print the ten documents with the
    largest absolute weight on it.
    """
    st = time.time()
    print "Start Time: ", st
    documents = get_input(input_file_path)
    # Pool of 15 workers -- presumably multiprocessing.Pool; it is never
    # closed or joined here.
    p = Pool(15)
    # Each document is indexable; the title of its first element identifies
    # it in the output below.
    urls = [row[0]['title'] for row in documents]
    # `evaluate` is defined elsewhere; its results are used as token lists.
    individual_results = p.map(evaluate, documents)
    dictionary = corpora.Dictionary(individual_results)
    corpus = [dictionary.doc2bow(text) for text in individual_results]
    tfidf = gensim.models.TfidfModel(corpus)
    imp_corpus = tfidf[corpus]
    lsimodel = models.LsiModel(imp_corpus, id2word=dictionary)
    cohmodel = models.CoherenceModel(model=lsimodel,
                                     corpus=imp_corpus,
                                     coherence='u_mass')
    print 'Coherence:', cohmodel.get_coherence_per_topic()
    lsi_corpus = lsimodel[imp_corpus]
    # Use the singular values to choose how many components to use
    # v[j] is the share of the j-th squared singular value in the total.
    v = lsimodel.projection.s**2 / sum(lsimodel.projection.s**2)
    print v[:100]
    # argmin over the boolean array gives the index of the first share that
    # is not above 0.005 (or 0 when every share is above it).
    k = np.argmin(
        v > 0.005
    ) + 1  # Hard threshold, may be better to plot and find the knee
    topics = lsimodel.show_topics(num_topics=k, num_words=5)
    #topcis2 = ldamodel.get_topics()
    for i, topic in enumerate(topics):
        print topic
        # Rank all documents by the absolute weight of topic i; a document
        # without an entry for the topic counts as 0.0.
        tops = sorted(zip(range(len(lsi_corpus)), lsi_corpus),
                      reverse=True,
                      key=lambda doc: abs(dict(doc[1]).get(i, 0.0)))
        print 'Most relevant documents: '
        for top in tops[:10]:
            print urls[top[0]]
        print
    #print corpus[0]
    end = time.time()
    # Despite the label, the value printed is the elapsed time in seconds.
    print "End Time: ", end - st
Ejemplo n.º 19
0
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3):
    """
    Train Mallet LDA models for several topic counts and score them with c_v

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) of the topic counts
    start : First topic count
    step : Increment between topic counts

    Returns:
    -------
    model_list : List of the trained LdaMallet models
    coherence_values : c_v coherence of each model, in the same order
    """
    mallet_path = "../models/mallet-2.0.8/bin/mallet"
    trained = []
    scores = []
    for topic_count in range(start, limit, step):
        mallet_lda = models.wrappers.LdaMallet(mallet_path,
                                               corpus=corpus,
                                               num_topics=topic_count,
                                               id2word=dictionary)
        trained.append(mallet_lda)
        evaluator = models.CoherenceModel(model=mallet_lda,
                                          texts=texts,
                                          dictionary=dictionary,
                                          coherence='c_v')
        scores.append(evaluator.get_coherence())

    return trained, scores
Ejemplo n.º 20
0
def optimum_topics(corpus, list_topics, dictionary, iterations, processed_content):
    """Train one LDA model per entry of ``list_topics`` and pick the best by c_v.

    Each model is trained on deep copies of ``corpus`` and ``dictionary``.
    Its log-perplexity on ``corpus`` and its c_v coherence against
    ``processed_content`` are recorded.  On equal coherence the later model
    wins.

    Returns the tuple ``(best_model, best_topic_count, best_coherence,
    coherences, perplexities, all_models)``.  For an empty ``list_topics``
    the first three entries are ``None``, ``0`` and ``-10000``.
    """
    best_score = -10000
    best_model = None
    best_topic_count = 0
    coherences = []
    perplexities = []
    trained_models = []
    for topic_count in list_topics:
        lda_model = models.LdaModel(corpus=deepcopy(corpus),
                                    num_topics=topic_count,
                                    id2word=deepcopy(dictionary),
                                    iterations=iterations)
        trained_models.append(lda_model)

        # Perplexity of the model on the training corpus.
        perplexities.append(lda_model.log_perplexity(corpus))

        # Coherence of the model.
        evaluator = models.CoherenceModel(model=lda_model,
                                          texts=processed_content,
                                          dictionary=dictionary,
                                          coherence='c_v')
        score = evaluator.get_coherence()
        coherences.append(score)

        if score >= best_score:
            best_topic_count = topic_count
            best_score = score
            best_model = lda_model

    return (best_model, best_topic_count, best_score, coherences,
            perplexities, trained_models)
Ejemplo n.º 21
0
def get_coherence_value(dictionary, doc_term_matrix, tokenized_list,
                        max_topics, processors):
    """Find the LSI topic count with the highest c_v coherence.

    Parameters
    ----------
    dictionary : Gensim dictionary (``id2word`` and coherence dictionary).
    doc_term_matrix : corpus the LSI models are trained on.
    tokenized_list : tokenised texts used for the coherence computation.
    max_topics : exclusive upper bound; topic counts 2 .. max_topics-1 are tried.
    processors : forwarded to ``CoherenceModel`` as ``processes``.

    Returns
    -------
    (best_model, coherence_value, best_num_topics)
        The best model, its coherence and its topic count.  When no
        candidate scores above 0 (including an empty range) the result is
        ``(None, 0, 0)``.
    """
    coherence_value = 0
    best_model = None
    best_num_topics = 0

    for num_topics in range(2, max_topics, 1):
        # Train one LSI model for this topic count.
        model = models.LsiModel(doc_term_matrix,
                                num_topics=num_topics,
                                id2word=dictionary)
        coherencemodel = models.CoherenceModel(model=model,
                                               texts=tokenized_list,
                                               dictionary=dictionary,
                                               coherence='c_v',
                                               processes=processors)

        # Evaluate once and reuse the value: the score is expensive to
        # compute and was previously requested twice per improving model.
        candidate = coherencemodel.get_coherence()
        if coherence_value < candidate:
            coherence_value = candidate
            best_model = model
            best_num_topics = num_topics

    return best_model, coherence_value, best_num_topics
Ejemplo n.º 22
0
# Script fragment: trains a 20-topic LDA model on two token corpora, prints
# the topics and the UMass coherence, then clusters the topic-word matrix.
# `path2` and `corpus1` are defined before this excerpt.

# Read the second corpus: one document per line, tokens separated by spaces.
corpus2=[]
with open(path2,'r',encoding='utf-8') as f:
    for line in f.readlines():
        corpus2.append(line.strip().split(' '))
    # NOTE(review): redundant -- the `with` block closes the file anyway.
    f.close()
corpus0=corpus1+corpus2
id2word=corpora.Dictionary(corpus0)
corpus = [id2word.doc2bow(text) for text in corpus0]
ldamodel=models.ldamodel.LdaModel(iterations=200,corpus=corpus,num_topics=20,id2word=id2word)
# Print the 20 highest-weighted words of each of the 20 topics.
for i in range(20):
    print('第{0}个主题的信息****************8'.format(i))
    print(ldamodel.show_topic(topicid=i,topn=20))

# `op` holds the result of get_topics(); presumably one row per topic and one
# column per dictionary word -- TODO confirm against the gensim docs.
op=ldamodel.get_topics()
print(op)
coh=models.CoherenceModel(ldamodel,corpus=corpus,dictionary=id2word,coherence='u_mass')

print(coh.get_coherence())
# Group the rows of `op` into 5 clusters with k-means.
kmean=KMeans(n_clusters=5)
kmean.fit(op)
pre_kmean=kmean.predict(op)
data=pd.DataFrame(op)
# Scatter of columns 45 and 187 of `op`, coloured by cluster; the choice of
# these two columns is not explained by the code.
plt.scatter(data[45],data[187],c=pre_kmean)
plt.show()
# NOTE(review): imports in the middle of the script; `plt` is already used
# above, so it must also have been imported before this excerpt.
from scipy.cluster.hierarchy import dendrogram, linkage,fcluster
from matplotlib import pyplot as plt

# Hierarchical clustering of the same rows with Ward linkage.
Z = linkage(data, 'ward')
# Flat clusters cut at distance 3; this rebinds `f`, previously the file handle.
f = fcluster(Z,3,'distance')
fig = plt.figure(figsize=(5, 3))
dn = dendrogram(Z)
Ejemplo n.º 23
0
def run_experiment(num_iterations):
    """Compare LDA quality when built from event names, descriptions or both.

    For every value 1..10 of the word-count argument handed to
    ``get_clusters`` (the x axis of the plots, labelled "Number of words")
    and for each of the three input variants, a model is requested from
    ``get_clusters`` and its log-perplexity and c_v coherence are recorded.
    This is repeated ``num_iterations`` times and the scores are averaged.

    The averages are plotted to ``perplexity.png`` and
    ``coherence_score.png``.  Nothing is returned.  ``num_iterations`` must
    not be 0, because the averaging step divides by it.
    """
    max_number_words = 10
    min_word_length = 3

    city = 'San Francisco'
    category = 'Arts & Culture'

    # (plot label, use-name flag, use-description flag), in evaluation order.
    variants = (
        ('Use name', True, False),
        ('Use description', False, True),
        ('Use both', True, True),
    )

    # Running sums per variant, one slot per word count.
    perplexity_totals = {
        label: [0] * max_number_words for label, _, _ in variants
    }
    coherence_totals = {
        label: [0] * max_number_words for label, _, _ in variants
    }

    for iteration in range(num_iterations):
        print("Iteration: " + str(iteration))
        for number_words in range(max_number_words):
            print("Num words: " + str(number_words))

            for label, use_name, use_description in variants:
                # Get model
                (_clusters, lda_model, corpus, tokenized_items,
                 dictionary_lda) = get_clusters(city, category, 'event', 10,
                                                number_words + 1,
                                                min_word_length, use_name,
                                                use_description)

                # Compute perplexity of model
                perplexity = lda_model.log_perplexity(corpus)

                # Compute coherence score of model
                evaluator = models.CoherenceModel(model=lda_model,
                                                  texts=tokenized_items,
                                                  dictionary=dictionary_lda,
                                                  coherence='c_v')
                coherence = evaluator.get_coherence()

                # Aggregate scores
                perplexity_totals[label][number_words] += perplexity
                coherence_totals[label][number_words] += coherence

    def averaged(totals):
        # Mean over the repetitions for one variant.
        return [total / num_iterations for total in totals]

    perplexity_means = {
        label: averaged(perplexity_totals[label]) for label, _, _ in variants
    }
    coherence_means = {
        label: averaged(coherence_totals[label]) for label, _, _ in variants
    }

    num_words = list(range(1, max_number_words + 1))

    def draw(y_label, title, series, target):
        # One line per variant on the current figure, then save it.
        plt.xlabel('Number of words')
        plt.ylabel(y_label)
        plt.title(title)
        for label, _, _ in variants:
            plt.plot(num_words, series[label], label=label)
        plt.legend(loc='upper right')
        plt.savefig(target)

    draw('Perplexity', 'Perplexity vs Number of Words in Topic',
         perplexity_means, 'perplexity.png')

    # Start the second plot from an empty figure.
    plt.clf()

    draw('Coherence score', 'Coherence score vs Number of Words in Topic',
         coherence_means, 'coherence_score.png')
Ejemplo n.º 24
0
    coherence_values = []
    for num_topics in r:
        print('Number of topics: %d' % num_topics)
        model = models.LdaModel(corpus, num_topics=num_topics, id2word=id2word,
                                alpha='auto', eta='auto', minimum_probability=0.001, passes=10)
        model_list.append(model)
        coherence_model = models.CoherenceModel(model=model, texts=Text, dictionary=dictionary, coherence='c_v')
        v = coherence_model.get_coherence()
        coherence_values.append(v)
        print('Coherence value: %f' % v)

    plt.plot(r, coherence_values)
    plt.xlabel('Num Topics')
    plt.ylabel('Coherence score')
    plt.legend('coherence_values', loc='best')
    plt.show()


if __name__ == '__main__':
    # Script entry point.  `Text` and `dictionary` are bound at module level
    # here; lda_model_selection() reads both names directly.
    Text = texts.Text
    dictionary = corpora.Dictionary(Text)
    corpus = [dictionary.doc2bow(text) for text in Text]
    # The models below are trained on the TF-IDF weighted corpus.
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    # Try 3, 6, ..., 27 topics and plot their coherence.
    lda_model_selection(corpus_tfidf, dictionary, range(3, 30, 3))

    # Fit an HDP model on the same corpus and print its c_v coherence.
    print('Getting HDP model coherence...')
    hdp = models.HdpModel(corpus_tfidf, id2word=dictionary)
    coherence_model = models.CoherenceModel(model=hdp, texts=Text, dictionary=dictionary, coherence='c_v')
    v = coherence_model.get_coherence()
    print('Coherence value: %f' % v)
Ejemplo n.º 25
0
def get_umass(corpus, num_topics, dictionary):
    """Compute the topic coherence metric (u_mass) of a freshly trained LDA model.

    Args:
        corpus: Bag-of-words corpus used both to train the model and to
            evaluate its coherence.
        num_topics: Number of topics the LDA model is trained with.
        dictionary: Mapping handed to the model as ``id2word`` and to the
            coherence model as ``dictionary``.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    lda = models.LdaModel(corpus=corpus,
                          num_topics=num_topics,
                          id2word=dictionary)
    scorer = models.CoherenceModel(model=lda,
                                   corpus=corpus,
                                   dictionary=dictionary,
                                   coherence="u_mass")
    return scorer.get_coherence()
Ejemplo n.º 26
0
def main():
    st = time.time()
    print "Start Time: ", st

    documents = get_input(input_file_path)

    p = Pool(15)
    #urls = [row[0]['URL_s'] for row in documents]
    individual_results = p.map(evaluate, documents)
    dictionary = corpora.Dictionary(individual_results)
    corpus = [dictionary.doc2bow(text) for text in individual_results]
    tfidf = gensim.models.TfidfModel(corpus)
    imp_corpus = tfidf[corpus]

    #LSA
    lsimodel = models.LsiModel(imp_corpus, id2word=dictionary)

    #LDA
    # lda_model = gensim.models.ldamodel.LdaModel(corpus=imp_corpus,
    #                                        id2word=dictionary,
    #                                        num_topics=6,
    #                                        random_state=100,
    #                                        update_every=1,
    #                                        chunksize=100,
    #                                        passes=10,
    #                                        alpha='auto',
    #                                        per_word_topics=True)

    #cohmodel = models.CoherenceModel(model=lda_model, corpus=imp_corpus, coherence='u_mass')
    cohmodel = models.CoherenceModel(model=lsimodel,
                                     corpus=imp_corpus,
                                     coherence='u_mass')
    # print 'Coherence:', cohmodel.get_coherence_per_topic()
    lsi_corpus = lsimodel[imp_corpus]
    #lsi_corpus = lda_model[imp_corpus]
    # Use the singular values to choose how many components to use
    v = lsimodel.projection.s**2 / sum(lsimodel.projection.s**2)
    #print v[:100]
    #k = np.argmin(v>0.005)+1    # Hard threshold, may be better to plot and find the knee

    #At the moment just print out 15 topics
    topics = lsimodel.show_topics(num_topics=15, num_words=5)
    #topics = lda_model.show_topics(num_topics=15, num_words=5)
    #topcis2 = ldamodel.get_topics()

    for i, topic in enumerate(topics):
        print topic

    listOfDocsPerTopic = []
    for i, topic in enumerate(topics):
        articles = []
        tops = sorted(zip(range(len(lsi_corpus)), lsi_corpus),
                      reverse=True,
                      key=lambda doc: abs(dict(doc[1]).get(i, 0.0)))

        curr = tops[0][1][i][1]
        j = 0
        while (abs(curr) > 0.3 and j < len(tops)):
            top = tops[j]
            j += 1
            curr = top[1][i][1]
            articles.append(top[0])
        listOfDocsPerTopic.append(articles)

    appendArticles(listOfDocsPerTopic[0], "topic-big-0", documents)
    appendArticles(listOfDocsPerTopic[2], "topic-big-2", documents)
    appendArticles(listOfDocsPerTopic[7], "topic-big-9", documents)

    end = time.time()
    print "End Time: ", end - st
    topicanzahl_liste.append(topicanzahl)
    dateiname_model1 = "DTM_%i_Topics.model" % (topicanzahl)
    model1 = utils.SaveLoad.load(dateiname_model1)
    dateiname_corpus1 = "Korpus_2000_bis_2013.mm"
    corpus1 = corpora.MmCorpus(dateiname_corpus1)

    dateiname_dictionary1 = "Dictionary_2000_bis_2013.dict"
    dictionary1 = corpora.dictionary.Dictionary.load_from_text(
        dateiname_dictionary1)

    coherence = []
    coherence_model.append(coherence)
    for time1 in range(0, Endjahr - Anfangsjahr + 1, 1):
        topics_dtm = model1.dtm_coherence(time1)
        cm = models.CoherenceModel(topics=topics_dtm,
                                   dictionary=dictionary1,
                                   corpus=corpus1,
                                   coherence='u_mass').get_coherence()
        coherence.append(cm)

# Column labels: every year from Anfangsjahr through Endjahr (inclusive).
jahressequenz = list(range(Anfangsjahr, Endjahr + 1))

# coherence_model holds one list of per-year coherence values per evaluated
# topic count, so the frame has topic counts as rows and years as columns.
x = pandas.DataFrame(coherence_model,
                     index=topicanzahl_liste,
                     columns=jahressequenz)
# Transpose so that each year becomes a row and each topic count a column.
a = x.transpose()
dateiname_evaluation = "DTM_Bigramm_Evaluation_%i_%i.csv" % (start, end)
a.to_csv(dateiname_evaluation, sep=';')
print("Benötigte Zeit: %0.3fs." % (time() - t0))
# Score previously saved LDA models with u_mass coherence for a range of topic
# counts, once using the top 10 and once using the top 20 words per topic.
t0 = time()  # start timestamp of this run
# Topic counts to evaluate: min, min + step, ..., up to and including max.
# NOTE(review): `min` and `max` rebind the built-in functions of the same name
# for everything that follows at module level.
min = 1
max = 100
step = 1
corpus = corpora.MmCorpus("./Topic_Modeling/Input_Data/corpus.mm")
dictionary = corpora.dictionary.Dictionary.load_from_text(
    "./Topic_Modeling/Input_Data/dictionary.dict")
topicanzahl = []  # evaluated topic counts, used as the DataFrame index below
coherence10 = []  # u_mass coherence with topn=10, one value per topic count
coherence20 = []  # u_mass coherence with topn=20, one value per topic count
for i in range(min, max + 1, step):
    topicanzahl.append(i)
    # One saved model per topic count; the count is the file name suffix.
    model = models.LdaModel.load("./Topic_Modeling/Models/Topic_Model_%i" % i)
    u_mass_10 = models.CoherenceModel(corpus=corpus,
                                      model=model,
                                      dictionary=dictionary,
                                      coherence='u_mass',
                                      topn=10).get_coherence()
    u_mass_20 = models.CoherenceModel(corpus=corpus,
                                      model=model,
                                      dictionary=dictionary,
                                      coherence='u_mass',
                                      topn=20).get_coherence()
    coherence10.append(u_mass_10)
    coherence20.append(u_mass_20)

# Single-column frames indexed by topic count.
top10a = pandas.DataFrame(data=coherence10, index=topicanzahl)
top20a = pandas.DataFrame(data=coherence20, index=topicanzahl)

pandas.DataFrame(top10a).to_csv(
    "./Topic_Modeling/Evaluation/UMass_Score_10_words.csv",
Ejemplo n.º 29
0
    )
    print(res)
    print(f"Execution time: {(time.time() - start_time)/60} mins")

#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#12buildingthetopicmodel
visualize_topics(lda_model, corpus, id2word, cv)

# Compute Coherence Score
# CoherenceModel is given a gensim Dictionary, so the vectorizer's vocabulary
# is wrapped in an otherwise empty Dictionary object.
d = corpora.Dictionary()
word2id = dict(cv.vocabulary_)
d.id2word = id2word
d.token2id = word2id

# The 'c_v' measure counts token co-occurrences inside sliding windows over
# `texts`, which gensim documents as tokenized texts (a list of token lists).
# Passing the raw documents meant each string was walked character by
# character, so vocabulary tokens were never matched - the likely cause of the
# nan score this section used to produce. Tokenize with the vectorizer's own
# analyzer so the tokens line up with `token2id`.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer (it exposes
# `vocabulary_`); documents that are already token sequences pass through.
analyzer = cv.build_analyzer()
tokenized_texts = [
    analyzer(doc) if isinstance(doc, str) else list(doc)
    for doc in df['text']
]

coherence_model_lda = models.CoherenceModel(model=lda_model,
                                            texts=tokenized_texts,
                                            dictionary=d,
                                            coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_lda}")

#trick: look at nouns only; by default it looks at all words as being the same
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html


def get_nouns(text):
    '''
  Given a string of text, tokenize the text and pull out only the nouns.
  '''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized_text = word_tokenize(text)
    all_nouns = [
Ejemplo n.º 30
0
# Take the first ten entries of sort_sims (presumably sorted by decreasing
# similarity - TODO confirm) and look up the matching documents; the first
# element of each entry is used as the index into `texts`.
top10 = sort_sims[:10]
top10doc = [texts[j[0]] for j in top10]

print(top10doc)

############################
# Train an LDA model with 100 topics on the TF-IDF weighted corpus
lda = models.LdaModel(corpus_tfidf, id2word=dct, num_topics=100)

# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus_tfidf))
# NOTE: log_perplexity() returns the per-word likelihood bound, not the
# perplexity itself; gensim derives the perplexity as 2 ** (-bound), so a
# higher (less negative) printed value corresponds to a lower perplexity.

# Compute Coherence Score ('c_v' measure, evaluated on `texts`)
coherence_model_lda = models.CoherenceModel(model=lda,
                                            texts=texts,
                                            dictionary=dct,
                                            coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# =============================================================================
# # Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda, corpus_tfidf, dct)
# vis
# =============================================================================

###########################
# Cross-validation of LDA
sentiment = df['Recommended IND']
#