def prepare(topic_model, corpus, dictionary, doc_topic_dist=None, **kwargs):
    """Transforms the Gensim TopicModel and related corpus and dictionary into the data
    structures needed for the visualization.

    Parameters
    ----------
    topic_model : gensim.models.hdpmodel.HdpModel
        An already trained Gensim HdpModel.
    corpus : array-like list of bag of word docs in tuple form or scipy CSC matrix
        The corpus in bag of word form, the same docs used to train the model.
        The corpus is transformed into a csc matrix internally; if you intend to
        call prepare multiple times, it is a good idea to first call
        `gensim.matutils.corpus2csc(corpus)` and pass in the csc matrix instead.
        For example: [(50, 3), (63, 5), ...]
    dictionary : gensim.corpora.Dictionary
        The dictionary object used to create the corpus. Needed to extract the
        actual terms (not ids).
    doc_topic_dist : array-like, optional
        The document topic distribution from LDA that is eventually visualised
        (default=None). If you will be calling `prepare` multiple times, it's a
        good idea to explicitly pass in `doc_topic_dist`, as inferring it for
        large corpora can be quite expensive.
    **kwargs :
        Additional keyword arguments are passed through to :func:`pyLDAvis.prepare`.

    Returns
    -------
    prepared_data : PreparedData
        The data structures used in the visualization.

    Example
    -------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/Gensim%20Newsgroup.ipynb

    See Also
    --------
    See `pyLDAvis.prepare` for **kwargs.
    """
    # We use sklearn's multi-dimensional scaling as the default measure to approximate
    # the distance between topics; it should be a slightly more stable implementation
    # than skbio's PCoA.
    if 'mds' not in kwargs:
        kwargs['mds'] = js_MDS
    opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs)
    return vis_prepare(**opts)
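
# Illustrative usage sketch (not part of the adapter itself): builds an HdpModel with
# the standard gensim API from a hypothetical list of tokenized documents `texts` and
# prepares it for visualization with the `prepare` function above. The output filename
# is an arbitrary placeholder.
def _example_hdp_usage(texts):
    from gensim.corpora import Dictionary
    from gensim.models.hdpmodel import HdpModel
    import pyLDAvis

    # Build the dictionary and bag-of-words corpus from the tokenized documents.
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    hdp = HdpModel(corpus=corpus, id2word=dictionary)

    vis = prepare(hdp, corpus, dictionary)
    pyLDAvis.save_html(vis, 'hdp_vis.html')
    return vis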
def prepare(corpus, dictionary, doc_topic_file, topic_term_file, **kwargs):
    """Transforms Mahout LDA output and the related corpus and dictionary into the data
    structures needed for the visualization.

    Parameters
    ----------
    corpus : array-like list of bag of word docs in tuple form or scipy CSC matrix
        The corpus in bag of word form, the same docs used to train the model.
        The corpus is transformed into a csc matrix internally; if you intend to
        call prepare multiple times, it is a good idea to first call
        `gensim.matutils.corpus2csc(corpus)` and pass in the csc matrix instead.
        For example: [(50, 3), (63, 5), ...]
    dictionary : gensim.corpora.Dictionary
        The dictionary object used to create the corpus. Needed to extract the
        actual terms (not ids).
    doc_topic_file : file handle to the document topic distribution from Mahout LDA
        The document topic distribution that is eventually visualised.
    topic_term_file : file handle to the topic term distribution from Mahout LDA
        The topic term distribution that is eventually visualised.
    **kwargs :
        Additional keyword arguments are passed through to :func:`pyLDAvis.prepare`.

    Returns
    -------
    prepared_data : PreparedData
        The data structures used in the visualization.

    See Also
    --------
    See `pyLDAvis.prepare` for **kwargs.
    """
    # We use sklearn's multi-dimensional scaling as the default measure to approximate
    # the distance between topics; it should be a slightly more stable implementation
    # than skbio's PCoA.
    if 'mds' not in kwargs:
        kwargs['mds'] = js_MDS
    doc_topic_dist = get_doc_topic(doc_topic_file)
    topic_term_dists = get_topic_term(topic_term_file, dictionary)
    opts = fp.merge(_extract_data(corpus, dictionary, doc_topic_dist, topic_term_dists), kwargs)
    return vis_prepare(**opts)
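
# Illustrative usage sketch (not part of the adapter itself): the file paths below are
# hypothetical placeholders for Mahout LDA output files; `corpus` and `dictionary` are
# the gensim-style objects described in the docstring above, supplied by the caller.
def _example_mahout_usage(corpus, dictionary,
                          doc_topic_path='mahout_doc_topics.csv',
                          topic_term_path='mahout_topic_terms.csv'):
    import pyLDAvis

    # The prepare function above expects open file handles for the Mahout output.
    with open(doc_topic_path) as doc_topic_file, open(topic_term_path) as topic_term_file:
        vis = prepare(corpus, dictionary, doc_topic_file, topic_term_file)
    pyLDAvis.save_html(vis, 'mahout_lda_vis.html')
    return vis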
def prepare(topic_model, corpus, dictionary, doc_topic_dist=None, **kwargs):
    """Transforms the Gensim TopicModel and related corpus and dictionary into the data
    structures needed for the visualization.

    Parameters
    ----------
    topic_model : gensim.models.ldamodel.LdaModel
        An already trained Gensim LdaModel. The other gensim model types are not
        supported (PRs welcome).
    corpus : array-like list of bag of word docs in tuple form or scipy CSC matrix
        The corpus in bag of word form, the same docs used to train the model.
        The corpus is transformed into a csc matrix internally; if you intend to
        call prepare multiple times, it is a good idea to first call
        `gensim.matutils.corpus2csc(corpus)` and pass in the csc matrix instead.
        For example: [(50, 3), (63, 5), ...]
    dictionary : gensim.corpora.Dictionary
        The dictionary object used to create the corpus. Needed to extract the
        actual terms (not ids).
    doc_topic_dist : array-like, optional
        The document topic distribution from LDA that is eventually visualised
        (default=None). If you will be calling `prepare` multiple times, it's a
        good idea to explicitly pass in `doc_topic_dist`, as inferring it for
        large corpora can be quite expensive.
    **kwargs :
        Additional keyword arguments are passed through to :func:`pyLDAvis.prepare`.

    Returns
    -------
    prepared_data : PreparedData
        The data structures used in the visualization.

    Example
    -------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/Gensim%20Newsgroup.ipynb

    See Also
    --------
    See `pyLDAvis.prepare` for **kwargs.
    """
    opts = fp.merge(_extract_data(topic_model, corpus, dictionary, doc_topic_dist), kwargs)
    return vis_prepare(**opts)
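
# Illustrative usage sketch (not part of the adapter itself): trains an LdaModel with
# the standard gensim API from a hypothetical list of tokenized documents `texts`, then
# pre-computes the csc corpus and an explicit document-topic distribution so that
# repeated calls to `prepare` skip the expensive conversion and inference steps, as the
# docstring above suggests.
def _example_lda_usage(texts, num_topics=10):
    from gensim.corpora import Dictionary
    from gensim.matutils import corpus2csc
    from gensim.models.ldamodel import LdaModel
    import pyLDAvis

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)

    # One way to obtain an explicit doc-topic distribution: normalize the variational
    # gamma matrix returned by LdaModel.inference (rows sum to one).
    gamma, _ = lda.inference(corpus)
    doc_topic_dist = gamma / gamma.sum(axis=1)[:, None]

    # Reuse the csc matrix and the distribution across multiple prepare() calls.
    csc_corpus = corpus2csc(corpus)
    vis = prepare(lda, csc_corpus, dictionary, doc_topic_dist=doc_topic_dist)
    pyLDAvis.save_html(vis, 'lda_vis.html')
    return vis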