Example #1
    def testMallet2Model(self):
        if not self.mallet_path:
            return

        tm1 = ldamallet.LdaMallet(self.mallet_path,
                                  corpus=corpus,
                                  num_topics=2,
                                  id2word=dictionary)
        tm2 = ldamallet.malletmodel2ldamodel(tm1)

        # set num_topics=-1 to exclude random influence
        self.assertEqual(tm1.show_topics(-1, 10), tm2.show_topics(-1, 10))

        for document in corpus:
            element1_1, element1_2 = tm1[document][0]
            element2_1, element2_2 = tm2[document][0]
            self.assertAlmostEqual(element1_1, element2_1)
            self.assertAlmostEqual(element1_2, element2_2, 1)
            element1_1, element1_2 = tm1[document][1]
            element2_1, element2_2 = tm2[document][1]
            self.assertAlmostEqual(element1_1, element2_1)
            self.assertAlmostEqual(element1_2, element2_2, 1)
            logging.debug('%d %d', element1_1, element2_1)
            logging.debug('%f %f', element1_2, element2_2)
            logging.debug('%s %s', tm1[document][1], tm2[document][1])
Example #2
 def create_document_topic_df(self, model=None, no_topics=10):
     """
     Creates a dataframe containing the result of the LDA model for each document. The topic with the
     highest share within a document is set as its dominant topic.
     :param model: LDA model to use for the calculation of the topic distribution of each document.
     :param no_topics: Number of topics in case no LDA model is provided.
     """
     if model is None:
         model = self.lda_model
     if isinstance(model, LdaMallet):
         model = malletmodel2ldamodel(model)
     topic_result_list = []
     for doc in model.get_document_topics(bow=self.bag_of_words):
         temp_dict = {}
         for topic, probability in doc:
             temp_dict[topic] = probability
         topic_result_list.append(temp_dict)
     self.result_df = pd.DataFrame(data=topic_result_list,
                                   columns=range(model.num_topics))
     self.result_df = self.result_df.fillna(0)
     if self.document_ids is not None and not self.language_detection:
         self.result_df.index = self.document_ids
     elif self.document_ids is not None and self.language_detection:
         raise Warning(
             "Using document ids and language detection together is not implemented (yet)."
         )
     dominant_topic = np.argmax(self.result_df.values, axis=1)
     self.result_df['dominant_topic'] = dominant_topic
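
For readers wiring this up outside the class, a minimal sketch of the same dominant-topic pattern (assuming `model` is a trained gensim LdaModel and `bow_corpus` a list of doc2bow vectors; both names are illustrative):

import numpy as np
import pandas as pd

# `model` and `bow_corpus` are assumptions, not part of the snippet above.
rows = []
for doc in model.get_document_topics(bow=bow_corpus):
    rows.append({topic: probability for topic, probability in doc})
result_df = pd.DataFrame(rows, columns=range(model.num_topics)).fillna(0)
result_df['dominant_topic'] = np.argmax(result_df.values, axis=1)
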
Example #3
    def topic_modelling(data_object_name):
        """
        perform topic modeling for a given set of posts (data object)
        :param data_object_name: raw data for topic modeling
        """
        data_words = Serialization.load_obj(data_object_name)

        stop_words = stopwords.words('english')
        print('removing stopwords and infrequent words...')
        ranks = Serialization.load_obj('dict.ranks')
        data_words = Utils.remove_noncontent_words(data_words, stop_words,
                                                   ranks)

        id2word = corpora.Dictionary(data_words)
        corpus = [id2word.doc2bow(post) for post in data_words]

        topics = CS_TOPICS
        print('performing topic modeling with', topics, 'topics')
        ldamodel = LdaMallet(mallet_path,
                             corpus=corpus,
                             num_topics=topics,
                             id2word=id2word)
        pprint(
            malletmodel2ldamodel(ldamodel).top_topics(corpus, data_words,
                                                      id2word))
Example #4
 def load(self, model_file):
     """
     Loads an LDA model from a given file.
     :param model_file: file containing the model to load
     """
     self.ldamodel = gensim.models.wrappers.LdaMallet.load(model_file)
     from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
     self.ldamodel = malletmodel2ldamodel(self.ldamodel)
     print(self.ldamodel.__dict__)
Example #5
 def testMallet2Model(self):
     if not self.mallet_path:
         return
     tm1 = ldamallet.LdaMallet(self.mallet_path, corpus=corpus, num_topics=2, id2word=dictionary)
     tm2 = ldamallet.malletmodel2ldamodel(tm1)
     for document in corpus:
         self.assertEqual(tm1[document][0], tm2[document][0])
         self.assertEqual(tm1[document][1], tm2[document][1])
         logging.debug('%s %s', tm1[document][0], tm2[document][0])
         logging.debug('%s %s', tm1[document][1], tm2[document][1])
Example #7
    def testMallet2ModelOn20NewsGroups(self):
        corpus = [simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")]
        dictionary = Dictionary(corpus)

        corpus = [dictionary.doc2bow(text) for text in corpus]

        lda_mallet_model = ldamallet.LdaMallet(
            self.mallet_path, corpus=corpus,
            num_topics=20, id2word=dictionary, iterations=500)

        lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model, iterations=1000)
        self.assertEqual(lda_mallet_model.show_topics(20, 50), lda_gensim_model.show_topics(20, 50))
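
`api` in the test above is gensim's dataset downloader; the fetch looks roughly like this (network access and the gensim-data package are assumed):

import gensim.downloader as api

# Each item streamed from "20-newsgroups" is a dict whose "data" field
# holds the raw post text.
newsgroups = api.load("20-newsgroups")
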
Example #8
def train_model(num_topics, documents):

    # documents = get_dictionary()
    dictionary = corpora.Dictionary(documents)
    max_tokens = len(dictionary.keys())
    # print(f'Num tokens before cleanup {len(dictionary.keys())}')
    dictionary.filter_extremes(no_below=10, no_above=0.7, keep_n=max_tokens)
    # print(f'Num tokens after cleanup {len(dictionary.keys())}')
    corpus_bow = [dictionary.doc2bow(doc) for doc in documents]
    mallet_model = LdaMallet(mallet_path=MALLET_BINARY_PATH,
                             corpus=corpus_bow,
                             id2word=dictionary,
                             num_topics=num_topics)
    lda_model = ldamallet.malletmodel2ldamodel(mallet_model)
    return lda_model, corpus_bow, dictionary
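
A hedged call sketch for train_model (the `documents` token lists and MALLET_BINARY_PATH are assumptions; a real corpus needs enough documents to survive filter_extremes(no_below=10)):

# Illustrative only: `documents` must be a list of token lists, and
# MALLET_BINARY_PATH must point at the mallet executable.
lda_model, corpus_bow, dictionary = train_model(num_topics=10, documents=documents)
for topic_id, words in lda_model.show_topics(num_topics=10, formatted=False):
    print(topic_id, [word for word, _ in words])
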
Example #9
    def testMallet2ModelOn20NewsGroups(self):
        corpus = [
            simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")
        ]
        dictionary = Dictionary(corpus)

        corpus = [dictionary.doc2bow(text) for text in corpus]

        lda_mallet_model = ldamallet.LdaMallet(self.mallet_path,
                                               corpus=corpus,
                                               num_topics=20,
                                               id2word=dictionary,
                                               iterations=500)

        lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model,
                                                          iterations=1000)
        self.assertEqual(lda_mallet_model.show_topics(20, 50),
                         lda_gensim_model.show_topics(20, 50))
Example #10
    def testMallet2Model(self):
        if not self.mallet_path:
            return

        tm1 = ldamallet.LdaMallet(self.mallet_path, corpus=corpus, num_topics=2, id2word=dictionary)
        tm2 = ldamallet.malletmodel2ldamodel(tm1)
        for document in corpus:
            element1_1, element1_2 = tm1[document][0]
            element2_1, element2_2 = tm2[document][0]
            self.assertAlmostEqual(element1_1, element2_1)
            self.assertAlmostEqual(element1_2, element2_2, 1)
            element1_1, element1_2 = tm1[document][1]
            element2_1, element2_2 = tm2[document][1]
            self.assertAlmostEqual(element1_1, element2_1)
            self.assertAlmostEqual(element1_2, element2_2, 1)
            logging.debug('%d %d', element1_1, element2_1)
            logging.debug('%f %f', element1_2, element2_2)
            logging.debug('%s %s', tm1[document][1], tm2[document][1])
Example #11
 def testMallet2Model(self):
     if not self.mallet_path:
         return
     tm1 = ldamallet.LdaMallet(self.mallet_path,
                               corpus=corpus,
                               num_topics=2,
                               id2word=dictionary)
     tm2 = ldamallet.malletmodel2ldamodel(tm1)
     for document in corpus:
         element1_1, element1_2 = tm1[document][0]
         element2_1, element2_2 = tm2[document][0]
         self.assertAlmostEqual(element1_1, element2_1)
         self.assertAlmostEqual(element1_2, element2_2, 1)
         element1_1, element1_2 = tm1[document][1]
         element2_1, element2_2 = tm2[document][1]
         self.assertAlmostEqual(element1_1, element2_1)
         self.assertAlmostEqual(element1_2, element2_2, 1)
         logging.debug('%d %d', element1_1, element2_1)
         logging.debug('%f %f', element1_2, element2_2)
         logging.debug('%s %s', tm1[document][1], tm2[document][1])
Example #12
def model_mallet(clean_doc, dictionary, doc_term_matrix):

    lda_mallet = LdaMallet(mallet_path,
                           corpus=doc_term_matrix,
                           id2word=dictionary,
                           num_topics=25,
                           workers=3)
    print("Topics generated with the mallet LDA model are:\n")
    pprint(lda_mallet.show_topics(formatted=False))
    print("----------------------------------------------------")

    coherence_model_mallet = CoherenceModel(model=lda_mallet,
                                            texts=clean_doc,
                                            dictionary=dictionary,
                                            coherence='c_v')
    coherence_mallet = coherence_model_mallet.get_coherence()
    print(f"coherence score: {coherence_mallet}")

    mallet_2 = ldamallet.malletmodel2ldamodel(lda_mallet)

    return mallet_2
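
A usage sketch for the function above (all three inputs are assumptions built with gensim's corpora tools):

# `clean_doc` is a list of token lists; `dictionary` a gensim Dictionary;
# `doc_term_matrix` the corresponding doc2bow corpus.
converted_model = model_mallet(clean_doc, dictionary, doc_term_matrix)
print(converted_model.show_topics(num_topics=5, num_words=8))
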
Example #13
    def testMallet2Model(self):
        if not self.mallet_path:
            return

        tm1 = ldamallet.LdaMallet(self.mallet_path, corpus=corpus, num_topics=2, id2word=dictionary)
        tm2 = ldamallet.malletmodel2ldamodel(tm1)

        # set num_topics=-1 to exclude random influence
        self.assertEqual(tm1.show_topics(-1, 10), tm2.show_topics(-1, 10))

        for document in corpus:
            element1_1, element1_2 = tm1[document][0]
            element2_1, element2_2 = tm2[document][0]
            self.assertAlmostEqual(element1_1, element2_1)
            self.assertAlmostEqual(element1_2, element2_2, 1)
            element1_1, element1_2 = tm1[document][1]
            element2_1, element2_2 = tm2[document][1]
            self.assertAlmostEqual(element1_1, element2_1)
            self.assertAlmostEqual(element1_2, element2_2, 1)
            logging.debug('%d %d', element1_1, element2_1)
            logging.debug('%f %f', element1_2, element2_2)
            logging.debug('%s %s', tm1[document][1], tm2[document][1])
Example #14
def get_optimum_topics(df, dictionary, doc_term_matrix, clean_doc, start,
                       limit):

    list_models, list_coherence = compute_coherence_values(
        dnary=dictionary,
        corpus=doc_term_matrix,
        texts=clean_doc,
        limit=limit,
        start=start,
        step=1)

    x = range(start, limit)
    plt.plot(x, list_coherence)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence Score")
    plt.show()

    final_model = list_models[list_coherence.index(max(list_coherence))]
    final_model_use = ldamallet.malletmodel2ldamodel(final_model)

    df["topic"] = df["reviewContent"].apply(get_review_topic,
                                            args=(dictionary, final_model_use))
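
`compute_coherence_values` is not shown in this example; a common implementation along these lines (a sketch, assuming gensim's LdaMallet wrapper and CoherenceModel are imported and `mallet_path` is defined):

def compute_coherence_values(dnary, corpus, texts, limit, start=2, step=1):
    # Train one LdaMallet model per topic count and score each with c_v coherence.
    list_models, list_coherence = [], []
    for num_topics in range(start, limit, step):
        model = LdaMallet(mallet_path, corpus=corpus,
                          id2word=dnary, num_topics=num_topics)
        coherence_model = CoherenceModel(model=model, texts=texts,
                                         dictionary=dnary, coherence='c_v')
        list_models.append(model)
        list_coherence.append(coherence_model.get_coherence())
    return list_models, list_coherence
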
Example #15
 def evaluate_pyldavis(self, model=None, use_jupyter=None):
     """
     Visually evaluates an LDA topic model using pyLDAvis.
     :param model: LDA model to evaluate. If None, the last model saved within the class is used.
     :param use_jupyter: how the pyLDAvis panel is displayed. If None (default), the method tries to detect
     whether it is run from Jupyter and chooses the display mode accordingly.
     :return:
     """
     if model is None:
         if self.lda_model is None:
             raise Exception(
                 "Please create a LDA model for evaluation before running this method."
             )
         model = self.lda_model
     if isinstance(model, LdaMallet):
         model = malletmodel2ldamodel(model)
     panel = pyLDAvis.gensim.prepare(model, self.bag_of_words, self.id2word)
     if use_jupyter is None:
         try:
             # Fragile heuristic: inspects the name of the launching executable.
             is_jupyter = os.environ['_'].split(
                 "/")[-1] == "jupyter-notebook"
             if is_jupyter:
                 pyLDAvis.enable_notebook()
         except KeyError:
             is_jupyter = False
         if is_jupyter:
             pyLDAvis.display(panel)
         else:
             pyLDAvis.show(panel)
     else:
         if use_jupyter:
             pyLDAvis.enable_notebook()
             pyLDAvis.display(panel)
         else:
             pyLDAvis.show(panel)
Example #16
    def topical_differences_sig_analysis():
        """
        testing code-switching and monolingual English posts for topical differences
        (1) partition code-switched posts into two random sets
        (2) perform topic modeling of each partition and compute the similarity between the two parts and
        their individual similarity to topics extracted from monolingual posts
        (3) test the multiple-experiment similarity scores for significance
        """
        data_object_name = 'monolingual.preprocessed'

        data_words = Serialization.load_obj(data_object_name)

        stop_words = stopwords.words('english')
        print('removing stopwords and infrequent words...')
        ranks = Serialization.load_obj('dict.ranks')
        data_words = Utils.remove_noncontent_words(data_words, stop_words,
                                                   ranks)
        print('after pre-processing: total of', len(data_words), 'posts')

        topics = MONOLINGUAL_TOPICS
        for i in range(EXPERIMENTS):
            shuffle(data_words)
            part1 = data_words[:math.floor(len(data_words) / 2)]
            part2 = data_words[math.floor(len(data_words) / 2):]

            model = Utils.model_topic(part1, topics)
            Serialization.save_obj(model,
                                   'lda.mallet.monolingual.part1.' + str(i))
            print('saved topic model: part1,', i)

            model = Utils.model_topic(part2, topics)
            Serialization.save_obj(model,
                                   'lda.mallet.monolingual.part2.' + str(i))
            print('saved topic model: part2,', i)
            sys.stdout.flush()

        # end for

        inter = []
        intra = []
        ldamodel_cs = malletmodel2ldamodel(
            Serialization.load_obj('lda.mallet.cs'))
        for i in range(30):
            print('processing', i)
            ldamodel_mono1 = malletmodel2ldamodel(
                Serialization.load_obj('lda.mallet.monolingual.part1.' +
                                       str(i)))
            ldamodel_mono2 = malletmodel2ldamodel(
                Serialization.load_obj('lda.mallet.monolingual.part2.' +
                                       str(i)))
            diff_matrix1, _ = ldamodel_cs.diff(ldamodel_mono1,
                                               distance='jaccard')
            diff_matrix2, _ = ldamodel_cs.diff(ldamodel_mono2,
                                               distance='jaccard')
            #intra.append(np.mean([np.mean(np.matrix(diff_matrix1)), np.mean(np.matrix(diff_matrix2))]))
            intra.append(
                np.mean([
                    np.min(np.matrix(diff_matrix1)),
                    np.min(np.matrix(diff_matrix2))
                ]))
            diff_matrix3, _ = ldamodel_mono1.diff(ldamodel_mono2,
                                                  distance='jaccard')
            #inter.append(np.mean(np.matrix(diff_matrix3)))
            inter.append(np.min(np.matrix(diff_matrix3)))
        # end for

        print(np.mean(intra), np.mean(inter))
        _, pval = ranksums(intra, inter)
        print('pval:', pval)
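
For reference, the model-comparison step above reduces to a few lines on toy models (all names here are illustrative):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "interface", "computer"], ["survey", "user", "computer"],
         ["graph", "trees", "minors"], ["graph", "minors", "survey"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
model_a = LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=1)
model_b = LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=2)

# Topic-by-topic Jaccard distances between the two models.
diff_matrix, _ = model_a.diff(model_b, distance='jaccard')
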
Example #17
def build(mallet_path,
          tweets_df,
          output_dir,
          num_topics=100,
          num_iterations=1000,
          verbose=False):
    """
    Builds a topic model from the given tweets. Writes the following files into
    the output directory:

    * summary.json: A JSON summary of the model, including top word
        weights
    * tweet_topics.csv: A CSV file indexed by tweet ID where each column represents
        the weight of a given topic in the tweet.
    * model.pkl: A pickle file that contains the LDA model.
    """

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    print("Preprocessing tweets...")
    initial_data = tweets_df.standardized_text.values.tolist()
    lemmatized_data = [tweet.split() for tweet in tqdm.tqdm(initial_data)]

    # Create dictionary and term-frequency mapping
    print("Building corpus...")
    id2word = corpora.Dictionary(lemmatized_data)
    tf = [id2word.doc2bow(tweet) for tweet in lemmatized_data]
    id2word.save(os.path.join(output_dir, "dictionary.pkl"))

    if verbose:
        print(
            'The corpus (token_id, #occurrences in this doc) for example tweets is:'
        )
        print(
            'Note: words are in order of token_id, not order of the original tweet')
        for i in range(20):
            print(tf[i])

        print('The first 20 words in the dictionary are:')
        for i in range(20):
            print(i, id2word[i])

    # Build the LDA model
    start_time = datetime.datetime.now()
    print("Building model...")
    if verbose: print('Started at ', str(start_time))

    lda_model = gensim.models.wrappers.LdaMallet(
        mallet_path=os.path.join(mallet_path, "bin", "mallet"),
        corpus=tf,
        num_topics=num_topics,
        iterations=num_iterations,
        id2word=id2word)

    end_time = datetime.datetime.now()
    print("Saving model...")
    lda_model.save(os.path.join(output_dir, "model.pkl"))

    if verbose:
        print('Elapsed time: {}'.format(str(end_time - start_time)))

    # Write outputs
    gensim_lda_model = malletmodel2ldamodel(lda_model)
    write_summary(lda_model,
                  gensim_lda_model,
                  lemmatized_data,
                  id2word,
                  tf,
                  output_dir,
                  num_topics=num_topics)
    write_tweet_topics(tweets_df,
                       gensim_lda_model,
                       id2word,
                       tf,
                       output_dir,
                       num_topics=num_topics)

    print("Done.")
Example #18
class Topics(object):
    __dict_path = os.path.join(os.path.dirname(__file__),
                               'models/mallet-dict.pkl')
    __model_path = os.path.join(os.path.dirname(__file__),
                                'models/mallet-model.model')
    __mallet_path = os.path.join(os.path.dirname(__file__),
                                 'models/mallet/bin/mallet')
    __topic_file_path = os.path.join(os.path.dirname(__file__),
                                     'models/topic-files/')

    dictionary = unpickle(__dict_path)
    model = LdaMallet.load(__model_path)
    model.mallet_path = __mallet_path
    model.prefix = __topic_file_path

    model_fast = malletmodel2ldamodel(model, 0.1, 1000)  # gamma_threshold=0.1, iterations=1000

    topic_map = {
        0: 'education',
        1: 'dating',
        2: 'change',
        3: 'communication',
        4: 'broken relationship',  # relationship status
        5: 'finances and accounting',
        6: 'excessive thoughts',
        7: 'politics',
        8: 'financial investments',
        9: 'physical health',
        10: 'work',
        11: 'sleep',
        12: 'emotions',
        13: 'medication regimen',
        14: 'past experiences / decisions',  # or decisions
        15: 'general apathy',
        16: 'NaN',  # ignore
        17: 'relocation',
        18: 'social stressors',
        19: 'memories',
        20: 'financial decisions',
        21: 'family',
        22: 'nutrition and weight',
        23: 'relationships',
        24: 'marital issues',
        25: 'religion and belief systems',
        26: 'experiences',
        27: 'financial pressure',
        28: 'romantic relationship',
        29: 'relationship issues',
        30: 'routines',
        31: 'taxes and claims',  # income and benefits
        32: 'symptoms of mental illness',
        33: 'dispute and argument',
        34: 'lack of motivation',
        35: 'reflection and mindfulness',
        36: 'event or festivity',
        37: 'self-harm',  # suicide
        38: 'resources and information',
        39: 'addiction',
        40: 'addiction recovery',
        41: 'leisure'
    }

    def get_topics(self, topics):
        top_topics = topics[:, 1].argsort()[-5:][::-1]

        # TODO: weight down scoring
        scores = 0.
        results = {}
        for idx, entry in enumerate(top_topics):
            topic = int(topics[entry][0])
            score = topics[entry][1]

            if idx == 0 and score <= .1:
                return None

            if scores < .55:
                if self.topic_map[topic] != 'NaN':
                    results[self.topic_map[topic]] = score
                scores += score
            else:
                break

        return results

    # def retrieve(self, doc):
    #    tokens = self.get_tokens(doc)
    #    bow = self.dictionary.doc2bow(tokens)
    #    topics = np.array(self.model[bow])
    #    return self.get_topics(topics)

    def retrieve(self, doc):
        tokens = self.get_tokens(doc)
        bow = self.dictionary.doc2bow(tokens)
        topics = np.array(self.model_fast[bow])
        return self.get_topics(topics)

    @staticmethod
    def get_tokens(doc):
        result = []
        for tok in doc:

            if tok.tag_ in ['IN', 'MD', 'CD']:  # fine-grained PTB tags live on tag_, not pos_
                continue

            if tok.is_digit or tok.like_num:
                continue

            if tok.is_punct or tok.is_stop:
                continue

            result.append(tok.text.lower())

        return result
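
get_tokens expects a spaCy Doc, so retrieval looks roughly like this (the spaCy model name and the pickled model paths in the class are assumptions):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: this model is installed
doc = nlp("I have been struggling with sleep and work stress lately.")
print(Topics().retrieve(doc))
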
Example #19
# docs_train / docs_test are pre-tokenised documents; `dictionary` is a gensim Dictionary built from them.
doc_term_mat_train = [dictionary.doc2bow(doc) for doc in docs_train]
doc_term_mat_test = [dictionary.doc2bow(doc) for doc in docs_test]

path_to_mallet_binary = r'C:\mallet\bin\mallet'
if __name__ == "__main__":
    model = LdaMallet(path_to_mallet_binary,
                      corpus=doc_term_mat_train,
                      alpha=5,
                      num_topics=10,
                      id2word=dictionary,
                      optimize_interval=50)

    topics = model.print_topics()
    for topic in topics:
        print(topic)

    # Compute Coherence Score for base model
    coherence_model_lda = CoherenceModel(model=model,
                                         corpus=doc_term_mat_train,
                                         texts=docs_train,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    gensim_model = ldamallet.malletmodel2ldamodel(model)
    # Visualize the topics
    vis_prepared = pyLDAvis.gensim.prepare(gensim_model, doc_term_mat_train,
                                           dictionary)
    pyLDAvis.save_html(vis_prepared, "mallet.html")
    print('\nCoherence Score: ', coherence_lda)
    plot_word_cloud(gensim_model)
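
plot_word_cloud is not defined in this snippet; one plausible sketch using the wordcloud package (an assumption, not the author's code):

import matplotlib.pyplot as plt
from wordcloud import WordCloud

def plot_word_cloud(model, topic_id=0, num_words=30):
    # Build a word cloud from one topic's word-probability pairs.
    frequencies = dict(model.show_topic(topic_id, num_words))
    cloud = WordCloud(background_color="white").generate_from_frequencies(frequencies)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
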
Example #20
    df['lemmatized'] = df['no_stop_words'].parallel_apply(lemmatization)
    print(df['lemmatized'].head(15))

    dictionary = corpora.Dictionary(df['lemmatized'])
    dictionary.filter_extremes(no_below=10, no_above=0.1)

    df['doc_to_bowed'] = df['lemmatized'].parallel_apply(dictionary.doc2bow)
    print(df['doc_to_bowed'].head(5))

    corpus = df['doc_to_bowed']

    ldamallet = gensim.models.wrappers.LdaMallet(mallet_path,
                                                 corpus=corpus,
                                                 num_topics=TOPICS_NUMBER,
                                                 id2word=dictionary)
    lda_native = malletmodel2ldamodel(ldamallet)
    lda_native.save("model/mallet_to_native.lda")

    corpora.Dictionary.save(dictionary, "model/dictionary.dict")
    corpora.BleiCorpus.save_corpus(fname="model/corpus.lda-c", corpus=corpus)

    pprint(lda_native.show_topics(20))

    # MIGHT BE USEFUL, SO DON'T REMOVE IT
    # bigram = gensim.models.Phrases(
    #     df['bigrams'], min_count=5, threshold=100
    # )  # higher threshold fewer phrases.
    # trigram = gensim.models.Phrases(bigram[df['text_new']], threshold=100)

    # # Faster way to get a sentence clubbed as a trigram/bigram
    # bigram_mod = gensim.models.phrases.Phraser(bigram)
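
lemmatization and parallel_apply come from outside this snippet (parallel_apply is pandarallel's drop-in for apply); one plausible lemmatization helper, sketched with spaCy (an assumption):

import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])  # assumption

def lemmatization(tokens):
    # Map a token list to lowercase lemmas, keeping alphabetic tokens only.
    doc = nlp(" ".join(tokens))
    return [tok.lemma_.lower() for tok in doc if tok.is_alpha]
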
Example #21
# Create a corpus from a list of texts
data = [a.split() for a in res]

dictionary = Dictionary(data)

corpus = [dictionary.doc2bow(t) for t in data]

# os.environ['MALLET_HOME'] = 'X:\\Programs\\mallet\\mallet-2.0.8\\'
mallet_path = 'X:\\Programs\\mallet\\mallet-2.0.8\\bin\\mallet.bat'

# Train the model on the corpus.
lda = LdaMallet(mallet_path, corpus, id2word=dictionary, num_topics=10)
# lda = LdaModel(corpus, id2word=dictionary, num_topics=10, alpha='auto', eval_every=5, chunksize=10, passes=10, decay=0.9)

elapsed_time = time.time() - start_time
print(time.strftime("Lda model criado, demorou %H:%M:%S:%m", time.gmtime(elapsed_time)))

for index, topic in lda.show_topics(formatted=False, num_words=10):
    print('Topic: {} \nWords: {}'.format(
        index, [pipe.predict([w[0]])[0][0] for w in topic]))

# for index, topic in lda.show_topics(formatted=False, num_words=20):
#     print('Topic: {} \nWords: {}'.format(index, [w[0] for w in topic]))

model = malletmodel2ldamodel(lda)
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)


# In a notebook, the bare `vis` expression would render inline; in a script, save to HTML instead.
pyLDAvis.save_html(vis, 'lda-pt-mallet.html')