Esempio n. 1
0
        proba(term | topic) = beta[topic][term]

        We shall for each topic find the top 20 words that contribute 
        to a document being classified as said topic
        """
        top_20_per_topic = np.argsort(self.beta * (-1), axis=1)
        for i in range(self.nb_topics):
            for j in range(self.nb_terms):
                if top_20_per_topic[i][j] < 20:
                    print(self.index[j], end=" ")
            print()


if __name__ == "__main__":
    # Example application: run the LDA pipeline on the 20 newsgroups
    # training split and print the resulting word/topic association.
    from sklearn.datasets import fetch_20newsgroups

    # Ask the loader to drop headers, footers and quoted replies.
    stripped_parts = ('headers', 'footers', 'quotes')
    newsgroups = fetch_20newsgroups(subset='train', remove=stripped_parts)

    # Preprocess the raw documents first, then build the bag-of-words
    # together with its index from the preprocessed corpus.
    preprocessor = Preprocessing()
    cleaned_docs = preprocessor.corpus_preproc(newsgroups["data"])
    term_index, bag_of_words = preprocessor.build_bow(cleaned_docs)

    # NOTE(review): the first positional argument (5) is presumably the
    # number of topics -- confirm against LDA.__init__.
    model = LDA(5, bag_of_words, term_index, alpha=0.1, set_alpha=True)

    # Iteration caps are passed by keyword; judging by the names they bound
    # the EM loop and the inner variational loop -- verify in LDA.estimation.
    model.estimation(max_iter_em=100, max_iter_var=10)

    model.display_word_topic_association()