Example #1
import csv

from gensim import corpora
from gensim.models import LdaModel


def explore(parameters, run):
    print(parameters)
    no_above = parameters["no_above"]
    chunksize = parameters["chunksize"]
    passes = parameters["passes"]
    iterations = parameters["iterations"]
    size = parameters["size"]
    num_topics = parameters["num_topics"]

    with open(fname, 'a', newline='', encoding='utf-8') as csv_file:
        run += 1
        print("Run " + str(run) + " out of " + str(runs))
        writer = csv.writer(csv_file)
        dictionary.filter_extremes(no_below=no_below,
                                   no_above=no_above,
                                   keep_tokens=None)
        corpus = [dictionary.doc2bow(review) for review in reviews]
        corpora.MmCorpus.serialize(name + '.mm', corpus)
        mm = corpora.MmCorpus(
            name + '.mm')  # `mm` document stream now has random access
        mm_used = mm[:size]
        writer.writerows([[
            "Data size", "Topics", "no_above", "Chunksize", "Passes",
            "Iteration"
        ], [size, num_topics, no_above, chunksize, passes, iterations], []])

        lda = LdaModel(mm_used,
                       num_topics=num_topics,
                       chunksize=chunksize,
                       id2word=dictionary,
                       passes=passes,
                       iterations=iterations,
                       eval_every=eval_every)

        lst = []
        for topic in lda.print_topics(-1, 10):
            terms = [
                x[0] for x in lda.get_topic_terms(topic[0], topn=10)
            ]
            term_strings = [str(dictionary[term]) for term in terms]
            str_topic = []
            str_topic.append("Topic " + str(topic[0] + 1))
            str_topic.extend(term_strings)
            lst.append(str_topic)

        writer.writerows(zip(*lst))
        writer.writerow([])

        return run
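# A hypothetical driver for explore(), not part of the original snippet; it
# assumes the module-level names the function relies on (fname, no_below,
# dictionary, reviews, name, eval_every) are already defined elsewhere.
param_grid = [
    {"no_above": 0.5, "chunksize": 2000, "passes": 10,
     "iterations": 400, "size": 10000, "num_topics": 20},
    {"no_above": 0.4, "chunksize": 2000, "passes": 10,
     "iterations": 400, "size": 10000, "num_topics": 30},
]
runs = len(param_grid)  # used by the progress message inside explore()
run = 0
for parameters in param_grid:
    run = explore(parameters, run)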
Example #2
from collections import defaultdict

from gensim.models import CoherenceModel, LdaModel


class MyLda:
    def __init__(self, myDictionary, num_topics=100, topic_threshold=0.15):
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.myDictionary = myDictionary
        self.model = LdaModel(self.myDictionary.doc2bows,
                              id2word=self.myDictionary.dictionary,
                              num_topics=num_topics)
        self.topic2ids, self.id2topics = self.get_mappings()
        self.coherenceModel = None
        print("- Created MyLda with {} topics".format(self.num_topics))

    def get_mappings(self):
        topic2ids, id2topics = defaultdict(list), defaultdict(list)
        for i, doc2bow in enumerate(self.myDictionary.doc2bows):
            topic_pairs = self.model.get_document_topics(doc2bow)
            for j, (topic, prob) in enumerate(topic_pairs):
                if prob >= self.topic_threshold or j == 0:
                    topic2ids[topic].append(i)
                    id2topics[i].append(topic)
        return topic2ids, id2topics

    def get_topic_terms(self, topic):
        terms = self.model.get_topic_terms(topic)
        return terms

    def get_top_topic(self):
        top_topics = self.model.top_topics(corpus=self.myDictionary.doc2bows)
        average = sum([t[1] for t in top_topics]) / self.num_topics
        return top_topics, average

    def get_perplexity(self):
        return self.model.log_perplexity(self.myDictionary.doc2bows)

    def get_coherence(self):
        if not self.coherenceModel:
            self.coherenceModel = CoherenceModel(model=self.model,
                                                 corpus=self.myDictionary.doc2bows,
                                                 dictionary=self.myDictionary.dictionary,
                                                 coherence='u_mass')
        return self.coherenceModel.get_coherence()
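# A hypothetical usage sketch, not part of the original example: MyLda only
# needs a wrapper exposing `dictionary` and `doc2bows`, so a minimal stand-in
# for the (unshown) myDictionary class might look like this.
from gensim.corpora import Dictionary


class MyDictionary:
    def __init__(self, texts):
        self.dictionary = Dictionary(texts)
        self.doc2bows = [self.dictionary.doc2bow(text) for text in texts]


texts = [["human", "machine", "interface"],
         ["graph", "trees", "minors"],
         ["machine", "learning", "graph"]]
myLda = MyLda(MyDictionary(texts), num_topics=2)
print(myLda.get_perplexity())
print(myLda.get_coherence())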
Example #3
from typing import Tuple

from gensim import corpora
from gensim.models import LdaModel


def lda_topics(processed_data: list, n_topics: int = 10, learning_decay: float = 0.5,
               learning_offset: float = 1.0, max_iter: int = 50, n_words: int = 10) -> Tuple[list, list]:
    """
    lda_topics perfoms LDA topic modeling on the input data

    :param processed_data: list of preprocessed segments
    :param n_topics: number of topics to extract form corpus
    :param learning_decay: learning decay parameter for LDA
    :param learning_offset: learning offset parameter for LDA
    :param max_iter: max. number of interations
    :param n_words: number of topic representatives

    :return:
        - topics - list of topics (and their representatives
        - doc_topics - list of predicted topics, one for each segment
    """

    dictionary = corpora.Dictionary(processed_data)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_data]

    lda_model = LdaModel(doc_term_matrix, id2word=dictionary, num_topics=n_topics, offset=learning_offset,
                         random_state=42, update_every=1, iterations=max_iter,
                         passes=10, alpha='auto', eta="auto", decay=learning_decay, per_word_topics=True)

    topics = []
    for i_t, topic_word_dist in enumerate(lda_model.get_topics()):
        topic = [lda_model.id2word[w_id] for w_id, _ in lda_model.get_topic_terms(i_t, topn=n_words)]
        topics.append(topic)

    # getting documents topic labels
    doc_topics = []
    for doc in doc_term_matrix:

        doc_t_dist = sorted(lda_model.get_document_topics(doc), key=lambda item: item[1], reverse=True)
        t, _ = doc_t_dist[0]
        doc_topics.append(t)

    assert len(doc_topics) == len(processed_data)
    return topics, doc_topics
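# A hypothetical call to lda_topics(), not part of the original snippet; the
# toy segments below stand in for real preprocessed data.
docs = [["cat", "dog", "pet", "fur"],
        ["python", "code", "bug", "test"],
        ["dog", "bark", "pet", "fur"]]
topics, doc_topics = lda_topics(docs, n_topics=2, n_words=3)
print(topics)      # two lists with three representative words each
print(doc_topics)  # one predicted topic id per input segment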
Example #5
# The opening of this call is cut off in the original example; the lines below
# are a plausible reconstruction (corpus and dictionary are both used further
# down in the snippet).
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    random_state=100,
    num_topics=10,
    passes=5,
    chunksize=10000,
    alpha='asymmetric',
    decay=0.5,
    offset=64,
    eta=None,
    eval_every=0,
    iterations=100,
    gamma_threshold=0.001,
    per_word_topics=True)

## See the topics
lda_model.print_topics(-1)  # show all topics
lda_model.get_topic_terms(0, topn=10)  # top 10 words in topic 0
lda_model.log_perplexity(corpus)  # compute the log perplexity
# Document topic distribution. By default, topics to which the document
# assigns a low probability are not shown.
lda_model.get_document_topics(corpus[0])
# With minimum_probability=0, every topic and its associated probability is returned.
lda_model.get_document_topics(corpus[0], minimum_probability=0)
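# A small follow-up sketch (not in the original): keep only the topics whose
# probability for the first document exceeds an assumed threshold of 0.05.
doc_topics = lda_model.get_document_topics(corpus[0], minimum_probability=0)
major_topics = [(t, p) for t, p in doc_topics if p > 0.05]
print(major_topics)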
### Document topic
####
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.show(vis)
Example #6
import os
import time

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel


def build_lda_model(tokens_tags,
                    pos_tags,
                    use_nouns=True,
                    use_verbs=True,
                    use_all=False,
                    num_of_topics=8,
                    passes=25,
                    verbose=True):
    path = os.getcwd()[:os.getcwd().rfind('/')]
    topics_filename = str(num_of_topics) + "topics"
    if use_nouns:
        topics_filename += "_nouns"
    if use_verbs:
        topics_filename += "_verbs"
    if use_all:
        topics_filename += "_all"

    # Set the LDA, Dictionary and Corpus filenames
    lda_filename = path + "/models/topic_models/lda_" + topics_filename + ".model"
    dict_filename = path + "/res/topic_data/dict/dict_" + topics_filename + ".dict"
    corpus_filename = path + "/res/topic_data/corpus/corpus_" + topics_filename + ".mm"

    # Build a topic model if it wasn't created yet
    if not os.path.exists(lda_filename):
        # Extract the lemmatized documents
        docs = []
        for index in range(len(tokens_tags)):
            tokens = tokens_tags[index].split()
            pos = pos_tags[index].split()
            docs.append(
                data_proc.extract_lemmatized_tweet(tokens, pos, use_verbs,
                                                   use_nouns, use_all))

        # Compute the dictionary and save it
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(keep_n=40000)
        dictionary.compactify()
        dictionary.save(dict_filename)

        # Compute the bow corpus and save it
        corpus = [dictionary.doc2bow(d) for d in docs]
        MmCorpus.serialize(corpus_filename, corpus)

        if verbose:
            print("\nCleaned documents:", docs)
            print("\nDictionary:", dictionary)
            print("\nCorpus in BoW form:", corpus)

        # Start training an LDA Model
        start = time.time()
        print("\nBuilding the LDA topic model...")
        lda_model = LdaModel(corpus=corpus,
                             num_topics=num_of_topics,
                             passes=passes,
                             id2word=dictionary)
        lda_model.save(lda_filename)
        end = time.time()
        print("Completion time for building LDA model: %.3f s = %.3f min" %
              ((end - start), (end - start) / 60.0))

        if verbose:
            print("\nList of words associated with each topic:")
            lda_topics = lda_model.show_topics(formatted=False)
            lda_topics_list = [[word for word, prob in topic]
                               for topic_id, topic in lda_topics]
            print([t for t in lda_topics_list])

    # Load the previously saved dictionary
    dictionary = Dictionary.load(dict_filename)

    # Load the previously saved corpus
    mm_corpus = MmCorpus(corpus_filename)

    # Load the previously saved LDA model
    lda_model = LdaModel.load(lda_filename)

    # Print the top 10 words for each topic
    if verbose:
        for topic_id in range(num_of_topics):
            print("\nTop 10 words for topic ", topic_id)
            print([
                dictionary[word_id]
                for (word_id,
                     prob) in lda_model.get_topic_terms(topic_id, topn=10)
            ])

    index = 0
    if verbose:
        for doc_topics, word_topics, word_phis in lda_model.get_document_topics(
                mm_corpus, per_word_topics=True):
            print('Index ', index)
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', word_phis)
            print('-------------- \n')
            index += 1
    return dictionary, mm_corpus, lda_model
Example #7
def run_lda_with_entropy(industry_lda, token_dict, max_k=5):
    common_dictionary = corpora.Dictionary(industry_lda)
    common_corpus = [common_dictionary.doc2bow(text) for text in industry_lda]
    ldamodel = LdaModel(corpus=common_corpus, num_topics=max_k + 1, id2word=common_dictionary)
    result = ldamodel.print_topics(num_topics=max_k + 1, num_words=10)
    center_lst = []
    for i in range(max_k + 1):
        result2 = ldamodel.get_topic_terms(topicid=i)
        sum_word = 0
        center = 0
        length = len(result2)
        for v in result2:
            if common_dictionary[v[0]] in token_dict.keys():
                center += token_dict[common_dictionary[v[0]]]
        center_lst.append(center / length)

    industry_with_center_distance = []
    sum_temp5_lst = []
    for i in industry_lda:
        temp2 = []
        for k in i:
            temp = []
            if k in token_dict.keys():
                for j in center_lst:
                    temp.append((cal_sim(np.array(token_dict[k]), j)))
            if len(temp) > 0:
                temp2.append(temp)
        if len(temp2) > 0:
            temp3 = np.array(temp2)
            temp4 = np.mean(temp3, axis=0)
            temp5 = np.sum(temp3)
        else:
            temp4 = []
            for _ in range(0, max_k + 1):
                temp4.append(0.0)
            temp5 = temp4

        industry_with_center_distance.append(temp4)
        sum_temp5_lst.append(temp5)

    entro_result_final = {}

    for number, i in enumerate(industry_lda):
        entro_result_2 = []
        for k in i:
            entro_result = []
            if k in token_dict.keys():
                for j in center_lst:
                    temp = cal_sim(np.array(token_dict[k]), j)
                    temp_value = temp / sum_temp5_lst[number]
                    entro_result.append(temp_value * math.log(temp_value))
            entro_result_2.append(entro_result)
        if len(entro_result_2) > 0:
            temp5 = np.zeros(shape=(1, max_k + 1), dtype=float)
            for w in entro_result_2:
                if len(w) > 0:
                    temp4 = np.array(w)
                    temp5 += temp4

            list_temp5 = list(temp5[0])
            entro_result_final[number] = list_temp5.index(max(list_temp5))

    final_result = defaultdict(list)
    for i in range(0, max_k + 1):
        for key, value in entro_result_final.items():
            if value == i:
                final_result[i].append(industry_lda[key])
Example #8
    # Fragment: this example begins mid-function; corpus, dictionary, data,
    # args, iterations and passes are defined earlier in the original source.
    model = LdaModel(
        corpus=corpus,
        id2word=dictionary.id2token,
        chunksize=1000,
        alpha='asymmetric',
        eta='auto',
        iterations=iterations,
        num_topics=args.num_topics,
        passes=passes,
        eval_every=None
    )

    topic_tokens = []
    for topicid in range(args.num_topics):
        topic_tokens.append([dictionary.id2token[k[0]] for i, k in enumerate(model.get_topic_terms(topicid, topn=4)) if i < 2 or k[1] > 0.025])

    paper_topic_data = []
    for paper, paper_bow in zip(data, corpus):
        topic_distr = model.get_document_topics(paper_bow, minimum_probability=0)
        paper_topic_data.append({
            "key": paper["key"],
            "year": paper["year"],
            "title": paper["title"],
            "topic_distr": {t: float(p) for t, p in topic_distr}
        })

    with open(args.outpath, 'w') as f:
        json.dump({
            "topics": topic_tokens,
            "paper_data": paper_topic_data 
Example #9
#   minimum_phi_value (float) – if per_word_topics is True, this represents a lower bound on the term probabilities that are included (None by default). If set to None, a value of 1e-8 is used to prevent 0s.
# Returns:
#   topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.
test = dct.doc2bow("I love Kitten".lower().strip().split())
print(lda.get_document_topics(test))
print(lda[test])

# Parameters: (word_id, minimum_probability=None)
# Topics associated with the given word.
# Each topic is represented as a tuple of (topic_id, term_probability).
print(lda.get_term_topics(0))

# ----- Composition of a single topic -----
# Parameters: (topicid, topn=10)
# Returns a list of (word_id, probability) pairs.
print(lda.get_topic_terms(0))
# Parameters: (topicno, topn=10)
# Returns a list of (word, probability) pairs.
print(lda.show_topic(0))
# Parameters: (topicno, topn=10)
# Returns a string such as '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'.
print(lda.print_topic(0))

# ----- Composition of all topics -----
# Default parameters: (num_topics=10, num_words=10, log=False, formatted=True)
# Returns a list such as [(0, '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'), ...]
print(lda.show_topics())
# [num_topics, vocabulary_size] array of floats (self.dtype)
# which represents the term topic matrix learned during inference.
print(lda.get_topics())

# ----- save and load model -----
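# The original snippet is cut off here; a minimal sketch of the save/load step,
# assuming `from gensim.models import LdaModel` and a placeholder file name.
lda.save("lda_model.gensim")
lda_loaded = LdaModel.load("lda_model.gensim")
print(lda_loaded.show_topics())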
Example #10
#-*- coding: utf-8 -*-
import pickle

from gensim.corpora import Dictionary
from gensim.models import LdaModel

with open("../data/corpus_test.pkl", "rb") as f:
    corpus = pickle.load(f)

corpus_dictionary = Dictionary(corpus)
corpus = [corpus_dictionary.doc2bow(text) for text in corpus]

CORPUS = corpus
TOPIC_NUM = 10
lda = LdaModel(corpus=CORPUS, num_topics=TOPIC_NUM)

doc_topic_matrix = lda.get_document_topics([(0, 1), (1, 1)])
term_topic_matrix = lda.get_term_topics(1)
topic_term_matrix = lda.get_topic_terms(1)
Example #11
raw_texts = methods.load_data(data_file)

processed_texts = methods.clean_data(raw_texts)

dictionary, corpus = methods.get_dict(processed_texts)

## create LDA model
ldamodel = LdaModel(corpus,
                    id2word=dictionary,
                    num_topics=NUMTOPICS,
                    passes=10)

# examine learned topics
topics_list = []
for topic_ind in range(NUMTOPICS):
    topic = ldamodel.get_topic_terms(topic_ind, NUMTERMS)
    topics_list.append([dictionary[pair[0]] for pair in topic])
    print("Topic", topic_ind, ":", topics_list[topic_ind])

# average coherence of the learned topics
# Since the dictionary was filtered, some words in the processed texts are no longer in it;
# build a separate dictionary for the coherence calculation only.
dictionary_coh = Dictionary(processed_texts)
coh = CoherenceModel(topics=topics_list,
                     texts=processed_texts,
                     dictionary=dictionary_coh,
                     coherence=coh_metric).get_coherence()
print("-" * 10)
# Coherence will be low here since the dataset is small and will not produce representative topics.
print("(Ranked using Rank_orig) Topics Coherence Score %r %r \n" %
      (coh_metric, coh))
Example #12
import networkx as nx
import numpy as np
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

g = nx.Graph()
g.add_edges_from([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3], [4, 5],
                  [4, 6], [4, 7], [5, 6], [5, 7], [6, 7], [3, 4]])

sentences = []
for node in g.nodes():
    sentences.append([str(nb) for nb in g.neighbors(node)])

# Create a corpus from a list of texts
common_dictionary = Dictionary(sentences)
common_corpus = [common_dictionary.doc2bow(text) for text in sentences]

lda = LdaModel(common_corpus, num_topics=2, eta=0.001, alpha=[0.001, 0.001])

s = lda.get_topic_terms(topicid=0, topn=g.number_of_nodes())

token2id = common_dictionary.token2id

id2node = {token2id[token]: token for token in token2id}

print(s)
print([(id2node[p[0]], p[1]) for p in s])
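# A small follow-up (not in the original): the same lookup for the second
# topic, to compare the two node groups the model separates.
s2 = lda.get_topic_terms(topicid=1, topn=g.number_of_nodes())
print([(id2node[p[0]], p[1]) for p in s2])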
Example #13
def lda_train(p_generate, theta_generate, phi_generate, num_topics, num_docs):
    import matplotlib.pyplot as plt
    from gensim.models import LdaModel, LdaMulticore
    import gensim.downloader as api
    from gensim.utils import simple_preprocess, lemmatize
    import nltk
    from nltk.corpus import stopwords
    from gensim import corpora
    import re
    import pyLDAvis
    import logging
    import numpy as np
    import scipy.spatial.distance
    import scipy.special
    import scipy.stats
    import sys
    from itertools import permutations
    from gensim.models import CoherenceModel
    np.set_printoptions(threshold=sys.maxsize)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    if __name__ == '__main__':
        __spec__ = None
    # Load the dictionary and corpus
    dct = corpora.Dictionary.load('dct.dict')
    corpus = corpora.MmCorpus('corpus.mm')
    num_words = len(dct)
    # Step 4: Train the LDA model
    lda_model = LdaModel(corpus=corpus,
                         id2word=None,
                         num_topics=num_topics,
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True,
                         minimum_probability=0)

    # save the model
    lda_model.save('lda_model.model')

    # See the topics
    i = 0
    theta_matrix = np.zeros((num_docs, num_topics))
    for c in lda_model[corpus]:
        print(i)
        print("Document Topics      : ", c[0])  # [(Topics, Perc Contrib)]
        for j in range(theta_matrix.shape[1]):
            theta_matrix[i, j] = c[0][j][1]
        i = i + 1

    #    print("Word id, Topics      : ", c[1][:])  # [(Word id, [Topics])]
    #print("Phi Values (word id) : ", c[2][:])  # [(Word id, [(Topic, Phi Value)])]
    #    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:]])   # [(Word, [Topics])]
    #    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:]])  # [(Word, [(Topic, Phi Value)])]
    #    print("------------------------------------------------------\n")

    for j in range(num_topics):
        print("Topic", j)
        for word_id, prob in lda_model.get_topic_terms(j, 10):
            print(dct[word_id], prob)

    phi_matrix = lda_model.get_topics()
    row_sums = theta_matrix.sum(axis=1)
    theta_matrix_new = theta_matrix / row_sums[:, np.newaxis]
    p = np.matmul(theta_matrix_new, phi_matrix)
    p_logit = scipy.special.logit(p)

    for i in range(p_logit.shape[0]):
        print(i)
        print(p_logit[i, ])
    p_logit_generate = np.load('p_logit_generate.npy')
    p_generate = np.load('p_generate.npy')
    theta_generate = np.load('theta_generate.npy')
    phi_generate = np.load('phi_generate.npy')
    corr_p = np.zeros((1, num_docs))
    corr_p_logit = np.zeros((1, num_docs))
    cosine_p = np.zeros((1, num_docs))
    for i in range(p_logit.shape[0]):
        corr_p_logit[0, i] = np.corrcoef(p_logit[i, ],
                                         p_logit_generate[i, ])[1, 0]
        corr_p[0, i] = np.corrcoef(p[i, ], p_generate[i, ])[1, 0]
        cosine_p[0, i] = scipy.spatial.distance.cosine(p[i, ], p_generate[i, ])
    corr_avg_p_inter = np.mean(corr_p)
    cosine_avg_p_inter = np.mean(cosine_p)
    corr_avg_p_logit_inter = np.mean(corr_p_logit)
    corr_avg_p_wordDist = np.mean(
        np.corrcoef(p)
    )  # Average of the correlation matrix for the word distribution of each document (shape numDocs x numDocs)
    corr_avg_p_docDist = np.mean(
        np.corrcoef(np.transpose(p))
    )  # Average of the correlation matrix for the document distribution of each word (shape dictLen x dictLen)

    corr_avg_pgenerate_wordDist = np.mean(
        np.corrcoef(p_generate)
    )  # Average of the correlation matrix for the word distribution of each document (shape numDocs x numDocs)
    corr_avg_pgenerate_docDist = np.mean(
        np.corrcoef(np.transpose(p_generate))
    )  # Average of the correlation matrix for the document distribution of each word (shape dictLen x dictLen)

    theta = theta_matrix_new
    phi = phi_matrix

    # This section compiles the correlation and cosine values for every column
    # arrangement (topic permutation) of the estimated theta and phi matrices.
    compilation_corr_theta = []
    compilation_cosine_theta = []
    compilation_corr_phi = []
    compilation_cosine_phi = []
    compilation_KL_theta = []
    compilation_KL_phi = []

    l = list(permutations(range(1, num_topics + 1)))

    for combi in range(len(l)):
        v_theta = np.zeros([num_docs, num_topics])
        v_phi = np.zeros([num_topics, num_words])
        for tid in range(num_topics):
            v_theta[:, tid] = theta[:, l[combi][tid] - 1]
            v_phi[tid, :] = phi[l[combi][tid] - 1, :]
        corr_theta = np.zeros((1, num_docs))
        cosine_theta = np.zeros((1, num_docs))
        KL_theta = np.zeros((1, num_docs))
        corr_phi = np.zeros((1, num_topics))
        cosine_phi = np.zeros((1, num_topics))
        KL_phi = np.zeros((1, num_topics))

        for i in range(theta_generate.shape[0]):
            corr_theta[0, i] = np.corrcoef(v_theta[i, :],
                                           theta_generate[i, :])[1, 0]
            cosine_theta[0, i] = scipy.spatial.distance.cosine(
                v_theta[i, :], theta_generate[i, :])
            KL_theta[0, i] = scipy.stats.entropy(theta_generate[i, :],
                                                 v_theta[i, :])
        compilation_corr_theta.append(corr_theta.mean())
        compilation_cosine_theta.append(cosine_theta.mean())
        compilation_KL_theta.append(KL_theta.mean())
        for i in range(phi_generate.shape[0]):
            corr_phi[0, i] = np.corrcoef(v_phi[i, :], phi_generate[i, :])[1, 0]
            cosine_phi[0, i] = scipy.spatial.distance.cosine(
                v_phi[i, :], phi_generate[i, :])
            KL_phi[0, i] = scipy.stats.entropy(phi_generate[i, :], v_phi[i, :])
        compilation_corr_phi.append(corr_phi.mean())
        compilation_cosine_phi.append(cosine_phi.mean())
        compilation_KL_phi.append(KL_phi.mean())

    compilation_cosine_phi = np.array(compilation_cosine_phi)
    compilation_corr_phi = np.array(compilation_corr_phi)
    compilation_KL_phi = np.array(compilation_KL_phi)
    compilation_cosine_theta = np.array(compilation_cosine_theta)
    compilation_corr_theta = np.array(compilation_corr_theta)
    compilation_KL_theta = np.array(compilation_KL_theta)

    alignment = compilation_KL_phi.argmin()
    if (alignment != compilation_cosine_phi.argmin()
            or alignment != compilation_cosine_theta.argmin()
            or alignment != compilation_corr_theta.argmax()
            or alignment != compilation_corr_phi.argmax()
            or alignment != compilation_KL_theta.argmin()):
        print('Warning: the alignments are not coherent.')

    #Determining the final correlation and cosine values
    v_theta = np.zeros([num_docs, num_topics])
    v_phi = np.zeros([num_topics, num_words])
    for tid in range(num_topics):
        v_theta[:, tid] = theta[:, l[alignment][tid] - 1]
        v_phi[tid, :] = phi[l[alignment][tid] - 1, :]
    corr_theta = np.zeros((1, num_docs))
    cosine_theta = np.zeros((1, num_docs))
    KL_theta = np.zeros((1, num_docs))
    corr_phi = np.zeros((1, num_topics))
    cosine_phi = np.zeros((1, num_topics))
    KL_phi = np.zeros((1, num_topics))
    for i in range(theta_generate.shape[0]):
        corr_theta[0, i] = np.corrcoef(v_theta[i, :], theta_generate[i, :])[1, 0]
        cosine_theta[0, i] = scipy.spatial.distance.cosine(
            v_theta[i, :], theta_generate[i, :])
        KL_theta[0, i] = scipy.stats.entropy(theta_generate[i, :], v_theta[i, :])
    for i in range(phi_generate.shape[0]):
        corr_phi[0, i] = np.corrcoef(v_phi[i, :], phi_generate[i, :])[1, 0]
        cosine_phi[0, i] = scipy.spatial.distance.cosine(v_phi[i, :],
                                                         phi_generate[i, :])
        KL_phi[0, i] = scipy.stats.entropy(phi_generate[i, :], v_phi[i, :])
    corr_theta = corr_theta.mean()
    cosine_theta = cosine_theta.mean()
    KL_theta = KL_theta.mean()
    corr_phi = corr_phi.mean()
    cosine_phi = cosine_phi.mean()
    KL_phi = KL_phi.mean()
    words_id = np.arange(num_words)
    #coherence_model_lda=CoherenceModel(model=lda_model,texts=corpus,dictionary=dct,coherence='c_v')
    #coherence_lda=coherence_model_lda.get_coherence()
    #print('\nCoherence Score: ', coherence_lda)
    return (v_phi, corr_theta, corr_phi, cosine_theta, cosine_phi, KL_theta,
            KL_phi)