Example No. 1
def _kmeans_to_prepared_data_pyldavis_score(x, index2word,
    centers, labels, embedding_method='tsne', radius=3.5,
    n_candidate_words=50, n_printed_words=30, lambda_step=0.01):
    """
    Don't use pyLDAvis's embedding method; it produces unstable results across training runs.
    """

    topic_term_dists = normalize(centers, norm='l1')

    empty_clusters = np.where(topic_term_dists.sum(axis=1) == 0)[0]
    default_weight = 1 / centers.shape[1]
    topic_term_dists[empty_clusters, :] = default_weight

    doc_topic_dists = np.zeros((x.shape[0], centers.shape[0]))
    for d, label in enumerate(labels):
        doc_topic_dists[d, label] = 1

    doc_lengths = x.sum(axis=1).A.ravel()

    term_frequency = x.sum(axis=0).A.ravel()
    term_frequency[term_frequency == 0] = 0.01 # preventing zeros

    if embedding_method == 'tsne':
        return pyLDAvis.prepare(
            topic_term_dists, doc_topic_dists, doc_lengths, index2word, term_frequency,
            R=radius, lambda_step=lambda_step, sort_topics=True,
            plot_opts={'xlab': 't-SNE1', 'ylab': 't-SNE2'}
        )
    else:
        return pyLDAvis.prepare(
            topic_term_dists, doc_topic_dists, doc_lengths, index2word, term_frequency,
            R=radius, lambda_step=lambda_step
        )
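A minimal usage sketch for the helper above (the toy corpus and names are hypothetical; assumes numpy, scikit-learn, and pyLDAvis are installed and that the helper and its own imports are in scope):

import pyLDAvis
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

docs = ["apple banana apple pie", "banana fruit salad bowl",
        "car engine wheel brake", "engine oil car garage"]
vect = CountVectorizer()
x = vect.fit_transform(docs)  # sparse document-term matrix
index2word = list(vect.get_feature_names_out())
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(x)

prepared = _kmeans_to_prepared_data_pyldavis_score(
    x, index2word, km.cluster_centers_, km.labels_, radius=5)
pyLDAvis.save_html(prepared, 'kmeans_topics.html')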
Example No. 2
def visualize_lda_mallet(self, **kwargs):
    """Visualize an LDA model trained with MALLET using pyLDAvis"""
    dataDir = "../data/mallet_files"  # update this if needed
    statefile = 'state.mallet.gz'
    data = get_LDA_data(dataDir, statefile)
    vis = pyLDAvis.prepare(sort_topics=False, **data, **kwargs)
    return vis
Example No. 3
def prepare(lda_model, dtm, id2term, **kwargs):
    """Create Prepared Data from sklearn's LatentDirichletAllocation and
    CountVectorizer.

    Parameters
    ----------
    lda_model : sklearn.decomposition.LatentDirichletAllocation.
        Latent Dirichlet Allocation model from sklearn fitted with `dtm`

    dtm : array-like or sparse matrix, shape=(n_samples, n_features)
        Document-term matrix used to fit the LatentDirichletAllocation model
        (`lda_model`)

    id2term : dict
        Mapping from feature id to term (word)

    **kwargs : Keyword arguments to be passed to pyLDAvis.prepare()


    Returns
    -------
    prepared_data : PreparedData
          the data structures used in the visualization


    Example
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

    See Also
    --------
    See `pyLDAvis.prepare` for **kwargs.
    """
    opts = fp.merge(_extract_data(lda_model, dtm, id2term), kwargs)
    return pyLDAvis.prepare(**opts)
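A hypothetical end-to-end call of this wrapper. The `_extract_data` helper and the `fp` (funcy) import it relies on are not part of the snippet, so this sketch only illustrates the expected inputs:

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

raw_documents = ["the cat sat on the mat", "dogs chase cats",
                 "stock markets fell today", "markets rallied on earnings"]
vect = CountVectorizer(stop_words='english')
dtm = vect.fit_transform(raw_documents)
lda_model = LatentDirichletAllocation(n_components=3, random_state=0).fit(dtm)
id2term = dict(enumerate(vect.get_feature_names_out()))

vis = prepare(lda_model, dtm, id2term, R=10)  # extra kwargs forwarded to pyLDAvis.prepare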
Example No. 4
def generate_ldavis_data_v1(data_path, run_name, model, idx_to_word, freqs,
                            vocab_size):
    """This function will launch a locally hosted session of pyLDAvis to visualize the results of our model"""
    doc_embed = model.sess.run(model.doc_embedding)
    topic_embed = model.sess.run(model.topic_embedding)
    word_embed = model.sess.run(model.word_embedding)

    # Extract all unique words in order of index: 0 - vocab_size
    vocabulary = []
    # NOTE! Keras Tokenizer indexes from 1, 0 is reserved for PAD token
    for i in range(1, vocab_size + 1):
        vocabulary.append(idx_to_word[i])

    # Read document lengths
    doc_lengths = np.load(data_path / run_name / 'doc_lengths.npy')

    # The `prepare_topics` function is a direct copy from Chris Moody
    vis_data = prepare_topics(doc_embed,
                              topic_embed,
                              word_embed,
                              np.array(vocabulary),
                              doc_lengths=doc_lengths,
                              term_frequency=freqs,
                              normalize=True)
    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    pyLDAvis.show(prepared_vis_data)
Example No. 5
def prepare(model_data_path, ignore_topics=[], ignore_terms=[], **kwargs):
    """Create Prepared Data from sklearn's LatentDirichletAllocation and CountVectorizer.

    Parameters
    ----------
    model_data_path : Path where TwitterLDA stored its data output

    Returns
    -------
    prepared_data : PreparedData
          the data structures used in the visualization


    Example
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

    See Also
    --------
    See `pyLDAvis.prepare` for **kwargs.
    """
    opts = fp.merge(
        _extract_data(model_data_path, ignore_topics, ignore_terms), kwargs)
    opts['sort_topics'] = False
    return pyLDAvis.prepare(**opts)
Example No. 6
def ldavis_show(metagenome, sample_probs, otu_probs, output=None):
    import pickle
    import pyLDAvis

    taxa_info = (metagenome.taxonomy.data.loc[metagenome.abundance.columns,
                                              ['Class', 'Genus']].apply(
                                                  lambda x: ';'.join(x),
                                                  axis=1))

    LDAvis_prepared = pyLDAvis.prepare(
        otu_probs.values,  # (topics x otus)
        sample_probs,  # (samples x topics)
        metagenome.abundance.data.sum(axis=1),  # (samples)
        taxa_info,  # (otus)
        metagenome.abundance.data.sum(axis=0).values)  # (otus)

    LDAvis_data_filepath = '{}/ldavis_prep.pkl'.format(str(metagenome.outdir))

    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    pyLDAvis.save_html(LDAvis_prepared, '{}/{}'.format(metagenome.figdir,
                                                       output))
Example No. 7
def prepare(topic_model, docs, **kwargs):
    """Transforms the GraphLab TopicModel and related corpus data into
    the data structures needed for the visualization.

    Parameters
    ----------
    topic_model : graphlab.toolkits.topic_model.topic_model.TopicModel
        An already trained GraphLab topic model.
    docs : SArray of dicts
        The corpus in bag of word form, the same docs used to train the model.
    **kwargs :
        additional keyword arguments are passed through to :func:`pyldavis.prepare`.

    Returns
    -------
    prepared_data : PreparedData
        the data structures used in the visualization

    Example
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/GraphLab.ipynb
    """
    opts = fp.merge(_extract_data(topic_model, docs), kwargs)
    return pyLDAvis.prepare(**opts)
Example No. 8
def lda_viz(docs, lengths, n_features, n_topics, n_top_words):
    n_samples = len(docs)

    norm = lambda data: pandas.DataFrame(data).div(data.sum(1), axis=0).values

    vect = CountVectorizer(max_df=0.95,
                           min_df=2,
                           max_features=n_features,
                           stop_words='english')
    vected = vect.fit_transform(docs)
    lda = LatentDirichletAllocation(n_components=n_topics,  # 'n_topics' in older scikit-learn
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    doc_topic_dists = norm(lda.fit_transform(vected))

    prepared = pyLDAvis.prepare(
        doc_lengths=lengths,
        vocab=vect.get_feature_names_out(),  # 'get_feature_names' in scikit-learn < 1.0
        term_frequency=vected.sum(axis=0).tolist()[0],
        topic_term_dists=norm(lda.components_),
        doc_topic_dists=doc_topic_dists,
    )

    #print(doc_topic_dists)
    #print(n_samples)
    return prepared, doc_topic_dists
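A sketch of how `lda_viz` might be driven end to end (the 20-newsgroups corpus is just a convenient stand-in; any list of strings works):

import pyLDAvis
from sklearn.datasets import fetch_20newsgroups

corpus = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')).data[:500]
lengths = [len(doc.split()) for doc in corpus]

prepared, dists = lda_viz(corpus, lengths, n_features=1000,
                          n_topics=5, n_top_words=10)
pyLDAvis.save_html(prepared, 'lda_vis.html')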
Example No. 9
def generate_ldavis_data(data_path, run_name, model, idx_to_word, freqs,
                         vocab_size):
    """This method will launch a locally hosted session of
    pyLDAvis that will visualize the results of our model
    """
    doc_embed = model.sesh.run(model.doc_embedding)
    topic_embed = model.sesh.run(model.topic_embedding)
    word_embed = model.sesh.run(model.word_embedding)

    # Extract all unique words in order of index 0-vocab_size
    vocabulary = []
    for i in range(vocab_size):
        vocabulary.append(idx_to_word[i])

    # Read in document lengths
    doc_lengths = np.load(data_path + "/" + run_name + "/" + "doc_lengths.npy")

    # The prepare_topics function is a direct copy from Chris Moody
    vis_data = prepare_topics(doc_embed,
                              topic_embed,
                              word_embed,
                              np.array(vocabulary),
                              doc_lengths=doc_lengths,
                              term_frequency=freqs,
                              normalize=True)

    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    pyLDAvis.show(prepared_vis_data)
Example No. 10
def prepare(lda_model, dtm, vectorizer, **kwargs):
    """Create Prepared Data from sklearn's LatentDirichletAllocation and CountVectorizer.

    Parameters
    ----------
    lda_model : sklearn.decomposition.LatentDirichletAllocation.
        Latent Dirichlet Allocation model from sklearn fitted with `dtm`

    dtm : array-like or sparse matrix, shape=(n_samples, n_features)
        Document-term matrix used to fit on LatentDirichletAllocation model (`lda_model`)

    vectorizer : sklearn.feature_extraction.text.(CountVectorizer, TfidfVectorizer)
        vectorizer used to convert raw documents to the document-term matrix (`dtm`)

    **kwargs : Keyword arguments to be passed to pyLDAvis.prepare()


    Returns
    -------
    prepared_data : PreparedData
          the data structures used in the visualization


    Example
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

    See Also
    --------
    See `pyLDAvis.prepare` for **kwargs.
    """
    opts = fp.merge(_extract_data(lda_model, dtm, vectorizer), kwargs)
    return pyLDAvis.prepare(**opts)
Example No. 11
def plot_pyldavis(topic_model, document_topic_matrix, document_term_matrix, file=None, **kwargs):
    """
    Generate a pyLDAvis visualization of the given topic model. For more information about
    the visualization, read the `original paper <http://www.aclweb.org/anthology/W14-3110>`_
    by Sievert and Shirley. Note that pyLDAvis only supports LDA models; passing an NMF
    model will raise an exception.

    :param document_topic_matrix: A document-topic matrix as returned by calling get_document_topic_matrix() on a topic model.
    :type document_topic_matrix: np.ndarray
    :param document_term_matrix: Term count weighted document-term matrix of the documents used to infer the document_topic_matrix.
    :type document_term_matrix: np.ndarray
    :param file: Path to store the HTML output. If no file is passed the plot is visualized in the browser.
    :type file: str
    :param kwargs: Further parameters passed directly to pyLDAvis's prepare function. See the `documentation <http://pyldavis.readthedocs.io/en/latest/modules/API.html#pyLDAvis.prepare>`_ for options. Note that ``sort_topics=False`` is already set.
    """
    if topic_model.model_name != 'lda':
        raise Exception('pyLDAvis only supports LDA. {} not supported'.format(topic_model.model_name))
    topic_token_matrix = topic_model.get_topic_token_matrix(normalize=True)
    id2word = topic_model.id2token

    document_lengths = np.sum(document_term_matrix, axis=1).getA1()
    term_frequencies = np.sum(document_term_matrix, axis=0).getA1()
    prepared_data = pyLDAvis.prepare(topic_token_matrix, document_topic_matrix, document_lengths, id2word,
                                     term_frequencies, sort_topics=False, **kwargs)

    ROOT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
    REPORT_DIR = os.path.join(ROOT_DIR, 'reports')
    if file:
        base_path = os.path.join(REPORT_DIR, 'figures/pyLDAvis')
        pa = os.path.join(base_path, file)
        with open(pa, 'w') as f:
            pyLDAvis.save_html(prepared_data, f)
    else:
        pyLDAvis.show(prepared_data)
Example No. 13
def generate_ldavis_data(data_path, model, idx_to_word, freqs, vocab_size):
    """
    This function will launch a locally hosted session of pyLDAvis to visualize the results of our model.

    :param data_path: (PosixPath) data location
    :param model: TensorFlow model
    :param idx_to_word: (dict) index-to-word mapping
    :param freqs: (list) frequency counts of each token
    :param vocab_size: (int) size of vocabulary
    :return:
    """
    doc_embed = model.sess.run(model.doc_embedding)
    topic_embed = model.sess.run(model.topic_embedding)
    word_embed = model.sess.run(model.word_embedding)

    # Extract all unique words in order of index: 1 to vocab_size - 1
    # NOTE! Keras Tokenizer indexes from 1; 0 is reserved for the PAD token
    vocabulary = ['<PAD>']
    for i in range(1, vocab_size):
        vocabulary.append(idx_to_word[i])

    # Read document lengths
    doc_lengths = np.load(data_path / 'doc_lengths.npy')

    # The `prepare_topics` function is a direct copy from Chris Moody
    vis_data = prepare_topics(doc_embed,
                              topic_embed,
                              word_embed,
                              np.array(vocabulary),
                              doc_lengths=doc_lengths,
                              term_frequency=freqs,
                              normalize=True)
    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    pyLDAvis.show(prepared_vis_data)
Example No. 14
def genSTTMHtml(data, uid):
    print('Preprocessing data...')
    first = Series(data).apply(chinese_word_cut)  # word segmentation
    tmp = first[first.notnull()]

    docs = [item for item in tmp if len(item) > 2]
    if len(docs) < 2:
        print('Too little data')
        return
    K = min(max(len(docs) // 100, 2), 10)

    # docs_len = len(docs)
    # if docs_len < 10:
    #     print('Too little data')
    #     return
    # if docs_len < 30:
    #     K = 2
    # elif docs_len < 100:
    #     K = 3
    # elif docs_len < 200:
    #     K = 5
    mgp = gen_mgp(K)
    vocab = set([x for doc in docs for x in doc])
    mgp.fit(docs, len(vocab))
    showResult(mgp)
    print('Visualizing model...')
    pytest = prepare_data(mgp, docs, vocab)
    movies_vis_data = pyLDAvis.prepare(**pytest)

    filename = '{}.html'.format(uid)
    filepath = os.path.join(save_html_dir, filename)
    pyLDAvis.save_html(movies_vis_data, filepath)
    return os.path.join('/assets', 'html', filename)
Example No. 15
def pylda_vis(args, model, corpus, time_slices, pre):
    """
        Function to visualize model using pyLDAvis

        input:
            args (argparse object): input arguments
            model: LDA model to visualize
            corpus: corpus to run LDA over
            time_slices (list): list containing the number of files per time slice
            pre (str): path to save all results to

        returns:
            None; one pyLDAvis HTML file is saved per time slice
    """
    print(timestamp() + " About to visualize...", file=sys.stderr)
    for t in range(len(time_slices)):
        doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(
            time=t, corpus=corpus)
        vis_wrapper = pyLDAvis.prepare(topic_term_dists=topic_term,
                                       doc_topic_dists=doc_topic,
                                       doc_lengths=doc_lengths,
                                       vocab=vocab,
                                       term_frequency=term_frequency,
                                       sort_topics=True)
        pyLDAvis.save_html(vis_wrapper,
                           pre + "time_slice_" + str(t) + ".html")
        print(timestamp() + " Prepared time slice",
              t,
              "for pyLDAvis...",
              file=sys.stderr)
Example No. 16
def plot_lda_vis(model_data, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, save_html, show
    model_vis_data = prepare(**model_data)
    if mode == 'save_html' and filename:
        save_html(model_vis_data, filename)
    else:
        show(model_vis_data)
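Hypothetical usage, assuming a model object exposing the `to_py_lda_vis()` contract mentioned in the docstring:

model_data = topic_model.to_py_lda_vis()  # dict of pyLDAvis.prepare() keyword arguments
plot_lda_vis(model_data, mode='save_html', filename='model_vis.html')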
Example No. 17
def learn_topic_model(X, vocab, graphlets, config, dbg=False):

    alpha = config['dirichlet_params']['alpha']
    eta = config['dirichlet_params']['eta']
    model = lda.LDA(n_topics=config['n_topics'], n_iter=config['n_iters'], random_state=1, alpha=alpha, eta=eta)

    model.fit(X)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 30

    feature_freq = (X != 0).sum(axis=0)
    doc_lengths = (X != 0).sum(axis=1)

    try:
        import pyLDAvis
        print("phi: %s. theta: %s. nd: %s. vocab: %s. Mw: %s"
              % (model.topic_word_.shape, model.doc_topic_.shape,
                 doc_lengths.shape, len(graphlets), len(feature_freq)))
        data = {'topic_term_dists': model.topic_word_,
                'doc_topic_dists': model.doc_topic_,
                'doc_lengths': doc_lengths,
                'vocab': list(graphlets.keys()),
                'term_frequency': feature_freq}

        vis_data = pyLDAvis.prepare(model.topic_word_, model.doc_topic_,
                                    doc_lengths, list(graphlets.keys()), feature_freq)
        # NOTE: `id` below comes from the enclosing module in the original project
        html_file = "../LDAvis/Learnt_Models/topic_model_" + id + ".html"
        pyLDAvis.save_html(vis_data, html_file)
        print("PyLDAVis ran. output: %s" % html_file)

        """investigate the objects used in the topics"""
        print("\ntype(topic_word): {}".format(type(topic_word)))
        print("shape: {}".format(topic_word.shape))
        topics = {}
        for i, topic_dist in enumerate(topic_word):
            objs = []
            topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
            #print('Topic {}: {}'.format(i, ' '.join( [repr(i) for i in topic_words] )))
            for j in [graphlets[k] for k in topic_words]:
                objs.extend(object_nodes(j)[0])
            topics[i] = objs
            if dbg:
                print('Topic {}: {}'.format(i, list(set(objs))))

    except ImportError:
        print("No module pyLDAvis. Cannot visualise topic model")

    """investigate the highly probably topics in each document"""
    doc_topic = model.doc_topic_
    # #Each document's most probable topic - don't have the UUIDs, so dont use this.
    # pred_labels = []
    # for n in range(doc_topic.shape[0]):
    #     if max(doc_topic[n]) > config['class_thresh']:
    #         topic_most_pr = doc_topic[n].argmax()
    #         pred_labels.append(topic_most_pr)

    return doc_topic, topic_word #, pred_labels
Example No. 18
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, show, save_html

    model_vis_data = _to_py_lda_vis(modeled_corpus)
    prepared_model_vis_data = prepare(**model_vis_data)
    if mode == 'save_html' and filename:
        logging.info("Saving pyLDAVis to {}".format(filename))
        save_html(prepared_model_vis_data, filename)
    else:
        show(prepared_model_vis_data, ip="0.0.0.0", port=8888)
Example No. 19
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes."""
    from pyLDAvis import prepare, show, save_html

    model_vis_data = _to_py_lda_vis(modeled_corpus)
    prepared_model_vis_data = prepare(**model_vis_data)

    if mode == 'save_html' and filename:
        save_html(prepared_model_vis_data, filename)
    else:
        show(prepared_model_vis_data)
Example No. 21
def lda_viz(topic_2_term, topic_2_doc, doc_lengths, vocab_, term_frequency):
    # create pyLDAvis object
    prepared_data = pyLDAvis.prepare(
        topic_term_dists=topic_2_term,
        doc_topic_dists=topic_2_doc,
        doc_lengths=doc_lengths,
        vocab=vocab_,
        term_frequency=term_frequency,
        start_index=0,
        sort_topics=False,
    )
    pyLDAvis.save_html(prepared_data, "ldaviz.html")
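For reference, the five core pyLDAvis inputs can be assembled from plain NumPy arrays. A self-contained toy call of the helper above (all numbers made up):

import numpy as np

rng = np.random.default_rng(0)
n_docs, n_topics, n_terms = 20, 3, 50

topic_term = rng.dirichlet(np.ones(n_terms), size=n_topics)  # rows sum to 1
doc_topic = rng.dirichlet(np.ones(n_topics), size=n_docs)    # rows sum to 1
doc_lengths = rng.integers(20, 200, size=n_docs)
vocab = ['term_%d' % i for i in range(n_terms)]
term_frequency = rng.integers(1, 500, size=n_terms)

lda_viz(topic_term, doc_topic, doc_lengths, vocab, term_frequency)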
Example No. 22
def ldavis_create(lda,
                  corpus,
                  gensim_dict,
                  LDAvis_data_filepath=fpathroot + fpathappend + '_lda_vis',
                  return_ldavis=False):
    # The (model, corpus, dictionary) argument pattern matches pyLDAvis's gensim
    # adapter, so this assumes `import pyLDAvis.gensim` is in scope (newer releases
    # call it pyLDAvis.gensim_models); plain pyLDAvis.prepare() takes raw arrays.
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, corpus, gensim_dict)
    with open(LDAvis_data_filepath, 'wb') as f:  # pickle requires a binary-mode file
        pickle.dump(LDAvis_prepared, f)
    if return_ldavis:
        return LDAvis_prepared
    else:
        pyLDAvis.display(LDAvis_prepared)
Example No. 23
def visualise_ldamallet_topics(dataset, alpha, num_topic):
    '''
    Extracts relevant information from ldamallet's LDA model and visualizes the topics with Gensim's LDA visualisation
    :return: visualisation
    '''
    ldamallet_dir = 'data/topic_models/basic/{}_alpha{}_{}/ldamallet'.format(
        dataset, alpha, num_topic)  # e.g. Semeval_alpha50_20
    convertedLDAmallet = convertLDAmallet(dataDir=ldamallet_dir,
                                          filename='state.mallet.gz')
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.prepare(**convertedLDAmallet)
    # pyLDAvis.display(vis)
    return vis
Example No. 24
def new(cls, name: str, dataset: Dataset, model: TopicModel,
        **kwargs) -> "Visualizer":
    path = common.PROJDIR / (name + ".LDAvis.json")
    pyLDAvis.save_json(
        pyLDAvis.prepare(model.get_topic_word_matrix(normalize=True),
                         model.get_doc_topic_matrix(normalize=True),
                         dataset.get_count_matrix().sum(axis=1).squeeze(),
                         [word.decode() for word in dataset.get_vocab()],
                         dataset.get_count_matrix().sum(axis=0).squeeze(),
                         **kwargs),
        str(path),
    )
    return cls(path)
Example No. 25
    def make_pyLDAVis(self, mdl, visualization_file='./visualization.html'):
        import pyLDAvis
        topic_term_dists = np.stack(
            [mdl.get_topic_word_dist(k) for k in range(mdl.k)])
        doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
        doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
        doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
        vocab = list(mdl.used_vocabs)
        term_frequency = mdl.used_vocab_freq

        prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
                                         doc_lengths, vocab, term_frequency)
        pyLDAvis.save_html(prepared_data, visualization_file)
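This helper targets tomotopy models. A hypothetical training loop that feeds it (the corpus path and `wrapper`, an instance of the unshown enclosing class, are assumptions):

import tomotopy as tp

mdl = tp.LDAModel(k=10, min_cf=2)
for line in open('corpus.txt', encoding='utf-8'):
    mdl.add_doc(line.split())
for _ in range(10):
    mdl.train(10)  # 100 Gibbs-sampling iterations in total

wrapper.make_pyLDAVis(mdl, visualization_file='lda.html')  # wrapper: instance of the enclosing class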
Example No. 26
    def prepare_visualization(self, documents: List[List[str]]) -> "pyLDAvis.PreparedData":
        """
        Prepare documents for visualization from trained model

        :param documents: List[List[str]]
            Tokenized documents

        :return: pyLDAvis.PreparedData
            Prepared word matrix, document distances, vocabulary and word counts for the pyLDAvis library
        """
        _voc: List[str] = []
        for cluster in self.cluster_word_distribution:
            _voc.extend(list(cluster.keys()))
        _vocabulary: List[str] = list(set(_voc))
        _doc_topic_distances: List[List[float]] = [self.predict_proba(doc) for doc in documents]
        for doc in _doc_topic_distances:
            for word in doc:
                assert not isinstance(word, complex)
        _doc_len = [len(doc) for doc in documents]
        _word_counts_map: dict = {}
        for doc in documents:
            for word in doc:
                _word_counts_map[word] = _word_counts_map.get(word, 0) + 1
        _word_counts: list = [_word_counts_map[term] for term in _vocabulary]
        _doc_topic_distances_ext: list = [[v if not math.isnan(v) else 1 / self.n_clusters for v in d] for d in _doc_topic_distances]
        _doc_topic_distances_ext = [d if sum(d) > 0 else [1 / self.n_clusters] * self.n_clusters for d in _doc_topic_distances_ext]
        for doc in _doc_topic_distances_ext:
            for f in doc:
                assert not isinstance(f, complex)
        assert (pd.DataFrame(_doc_topic_distances_ext).sum(axis=1) < 0.999).sum() == 0
        _word_matrix: list = []
        for cluster in self.cluster_word_distribution:
            _total: float = sum([frequency for word, frequency in cluster.items()])
            assert not math.isnan(_total)
            if _total == 0:
                _row: list = [(1 / len(_vocabulary))] * len(_vocabulary)
            else:
                _row: list = [cluster.get(word, 0) / _total for word in _vocabulary]
            for word in _row:
                assert not isinstance(word, complex)
            _word_matrix.append(_row)
        return pyLDAvis.prepare(topic_term_dists=_word_matrix,
                                doc_topic_dists=_doc_topic_distances_ext,
                                doc_lengths=_doc_len,
                                vocab=_vocabulary,
                                term_frequency=_word_counts,
                                R=30,
                                lambda_step=0.01,
                                sort_topics=False
                                )
Example No. 27
def tpc_vis(doc_wds_mat, tpc_wds_mat, doc_tpc_mat, vocab, outfile):
    # a flat list, so the indexing below picks out each input in turn
    data_input = [doc_wds_mat, tpc_wds_mat, doc_tpc_mat, vocab]

    data = {
        'topic_term_dists': data_input[1],
        'doc_topic_dists': data_input[2],
        'doc_lengths': data_input[0].sum(axis=1).A.squeeze(),
        'vocab': data_input[3],
        'term_frequency': data_input[0].sum(axis=0).A.squeeze()
    }

    vis_data = pyLDAvis.prepare(**data)
    pyLDAvis.save_html(vis_data, outfile)
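`tpc_vis` expects a SciPy sparse document-word matrix (note the `.A` attribute access); a toy invocation with made-up matrices might look like:

import numpy as np
from scipy import sparse

rng = np.random.default_rng(0)
n_docs, n_topics, n_terms = 10, 2, 30

doc_wds = sparse.csr_matrix(rng.integers(0, 4, size=(n_docs, n_terms)))
tpc_wds = rng.dirichlet(np.ones(n_terms), size=n_topics)
doc_tpc = rng.dirichlet(np.ones(n_topics), size=n_docs)
vocab = ['w%d' % i for i in range(n_terms)]

tpc_vis(doc_wds, tpc_wds, doc_tpc, vocab, 'topics.html')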
Example No. 28
def single_pyLDAvis(N, fin_tmpl, fout_tmpl, mds):
    filename = fin_tmpl.format(n=N)
    #print (filename)
    model_data = LDAp.load_model_from_pkl(filename)
    vis_data = pyLDAvis.prepare(**model_data, mds=mds)
    # pyLDAvis supports three 2-D dimensionality-reduction algorithms: 'PCOA', 'TSNE', 'MMDS'
    # See https://pyldavis.readthedocs.io/en/latest/modules/API.html#pyLDAvis.prepared_data_to_html
    html_out = fout_tmpl.format(n=N, kind=mds)

    # The d3, ldavis and ldavis_css assets must be downloaded in advance and placed
    # in the matching directories, relative to the directory of html_out
    pyLDAvis.save_html(vis_data, html_out,\
                  d3_url="js/d3.min.js", \
                  ldavis_url='js/ldavis.v1.0.0.js', \
                  ldavis_css_url='js/ldavis.v1.0.0.css')
    return (model_data, html_out)
Example No. 29
def lda_vis(self, n_words=30, name='model'):
    '''
    DESC: Creates pyLDAvis figure. Requires LDA topic_analysis model
    --Input--
        n_words = number of words to display in the barcharts of figure
        name = suffix for the saved HTML file ('pyLDAvis_' + name)
    ----------------------------------
    --Output--
        Returns pyLDAvis figure in html browser
    '''
    doc_lengths = [len(doc) for doc in self.corpus]
    vocab_lst = self.vectorizer.feature_names
    term_freq = textacy.vsm.get_doc_freqs(self.tfidf, normalized=False)
    topic_terms_tups = list(
        self.model.top_topic_terms(self.vectorizer.feature_names,
                                   topics=-1,
                                   top_n=len(vocab_lst),
                                   weights=True))
    # collect each topic's (word, weight) pairs
    lst = []
    for topic in topic_terms_tups:
        words = []
        for w in topic[1]:
            words.append(w)
        lst.append(words)
    # align every topic's weights with the vocabulary order
    topic_weight = []
    for topic in lst:
        weights = []
        for word in vocab_lst:
            for we in topic:
                if word == we[0]:
                    weights.append(we[1])
        topic_weight.append(weights)
    topic_term = np.array(topic_weight)
    self.ldavis = pyLDAvis.prepare(topic_term,
                                   self.topic_matrix,
                                   doc_lengths,
                                   vocab_lst,
                                   term_freq,
                                   R=n_words,
                                   mds='mmds',
                                   sort_topics=False)
    pyLDAvis.save_html(self.ldavis, 'pyLDAvis_' + name)
    print('plotting...')
    pyLDAvis.show(self.ldavis)
Example No. 30
    def visualize_topics(self, notebook_mode = False, mds = 'pcoa'):
        """
        Visualize the most important topics from the decomposition using pyLDAvis.

        Parameters
        ----------
        notebook_mode : bool, optional (default=False)
            If True, render the visualization inline in a Jupyter notebook.
        mds : str, optional (default='pcoa')
            2D Decomposition. Allowed values:

            * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)
            * ``'mmds'`` - Dimension reduction via Multidimensional scaling
            * ``'tsne'`` - Dimension reduction via t-distributed stochastic neighbor embedding
        """
        if not isinstance(mds, str):
            raise ValueError('mds must be a string')
        if not isinstance(notebook_mode, bool):
            raise ValueError('notebook_mode must be a boolean')
        try:
            import pyLDAvis
            import pyLDAvis.sklearn
        except ImportError:
            raise Exception(
                'pyldavis not installed. Please install it and try again.'
            )

        if notebook_mode:
            pyLDAvis.enable_notebook()

        vis_data = _prepare_topics(
            self._doc_embed,
            self._topic_embed,
            self._word_embed,
            np.array(self._features),
            doc_lengths = self._doc_len,
            term_frequency = self._freqs,
            normalize = True,
        )
        prepared_vis_data = pyLDAvis.prepare(**vis_data)
        if notebook_mode:
            return prepared_vis_data
        else:
            pyLDAvis.show(prepared_vis_data)
Example No. 31
def generate_ldavis_data(data_path, model, idx_to_word, freqs, vocab_size):
    """This method will launch a locally hosted session of
    pyLDAvis that will visualize the results of our model
    
    Parameters
    ----------
    data_path : str
        Location where your data is stored.
    model : Lda2Vec
        Loaded lda2vec tensorflow model. 
    idx_to_word : dict
        index to word mapping dictionary
    freqs : list
        Frequencies of each token.
    vocab_size : int
        Total size of your vocabulary
    """

    doc_embed = model.sesh.run(model.mixture.doc_embedding)
    topic_embed = model.sesh.run(model.mixture.topic_embedding)
    word_embed = model.sesh.run(model.w_embed.embedding)

    # Extract all unique words in order of index 0-vocab_size
    vocabulary = []
    for k, v in idx_to_word.items():
        vocabulary.append(v)

    # Read in document lengths
    doc_lengths = np.load(data_path + "/doc_lengths.npy")

    # The prepare_topics function is a direct copy from Chris Moody
    vis_data = prepare_topics(doc_embed,
                              topic_embed,
                              word_embed,
                              np.array(vocabulary),
                              doc_lengths=doc_lengths,
                              term_frequency=freqs,
                              normalize=True)

    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    pyLDAvis.display(prepared_vis_data)
Example No. 32
def pyLDA(topic_term_dists, doc_topic_dists, doc_lengths, vocab,
          term_frequency):
    """ use pyldavis show results in browser
    topic_term_dists : array-like, shape (`n_topics`, `n_terms`)
        Matrix of topic-term probabilities. Where `n_terms` is `len(vocab)`.
    doc_topic_dists : array-like, shape (`n_docs`, `n_topics`)
        Matrix of document-topic probabilities.
    doc_lengths : array-like, shape `n_docs`
        The length of each document, i.e. the number of words in each document.
        The order of the numbers should be consistent with the ordering of the
        docs in `doc_topic_dists`.
    vocab : array-like, shape `n_terms`
        List of all the words in the corpus used to train the model.
    term_frequency : array-like, shape `n_terms`
        The count of each particular term over the entire corpus. The ordering
        of these counts should correspond with `vocab` and `topic_term_dists`.
    """
    pyLDAvis.enable_notebook(True)
    data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists, doc_lengths,
                            vocab, term_frequency)
    pyLDAvis.show(data)
Example No. 33
File: fcm.py  Project: ecfm/fcm_cli
def visualize(self):
    with torch.no_grad():
        doc_concept_probs = self.get_train_doc_concept_probs()
        # [n_concepts, vocab_size] weighted word counts of each concept
        concept_word_counts = torch.matmul(
            doc_concept_probs.transpose(0, 1), self.bow_train)
        # normalize word counts to word distribution of each concept
        concept_word_dists = concept_word_counts / concept_word_counts.sum(1, True)
        # fill NaN with 1/vocab_size in case a concept has an all-zero word distribution
        concept_word_dists[
            concept_word_dists != concept_word_dists] = 1.0 / concept_word_dists.shape[1]
        vis_data = pyLDAvis.prepare(
            topic_term_dists=concept_word_dists.data.cpu().numpy(),
            doc_topic_dists=doc_concept_probs.data.cpu().numpy(),
            doc_lengths=self.doc_lens,
            vocab=self.vocab,
            term_frequency=self.word_counts)
        pyLDAvis.save_html(
            vis_data, os.path.join(self.out_dir, "visualization.html"))
Example No. 34
def generate_ldavis_data(clean_data_dir, model, idx_to_word, freqs,
                         vocab_size):

    doc_embed = model.sesh.run(model.mixture.doc_embedding)
    topic_embed = model.sesh.run(model.mixture.topic_embedding)
    word_embed = model.sesh.run(model.w_embed.embedding)

    vocabulary = []
    for _, v in idx_to_word.items():
        vocabulary.append(v)

    doc_lengths = np.load(clean_data_dir + "/doc_lengths.npy")

    vis_data = prepare_topics(doc_embed,
                              topic_embed,
                              word_embed,
                              np.array(vocabulary),
                              doc_lengths=doc_lengths,
                              term_frequency=freqs)

    prepared_vis_data = pyLDAvis.prepare(**vis_data)
    pyLDAvis.show(prepared_vis_data)
Example No. 36
def main(start, end, increment):
    path = Path('C:/Data/Python/JobLoss')
    data_words = []
    with open(path / 'Processed.json') as f:
        data = json.load(f)
        for tweet in data:
            data_words.append(' '.join(tweet[1]))
    vec = CountVectorizer()
    X = vec.fit_transform(data_words).toarray()
    vocab = np.array(vec.get_feature_names_out())  # 'get_feature_names' in scikit-learn < 1.0
    biterms = vec_to_biterms(X)
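    # NOTE: `chunksize` and `iterations` are used below but not defined in this
    # snippet; they are assumed to be module-level constants in the original project.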
    for k in range(start, end, increment):
        print('Model %s' % k)
        btm = oBTM(num_topics=k, V=vocab)
        for i in range(0, len(biterms), chunksize):
            print('%s / %s' % (i, len(biterms)))
            biterms_chunk = biterms[i:i + chunksize]
            btm.fit(biterms_chunk, iterations=iterations)
        topics = btm.transform(biterms)
        vis = pyLDAvis.prepare(btm.phi_wz.T, topics,
                               np.count_nonzero(X, axis=1),
                               vocab, np.sum(X, axis=0))
        pyLDAvis.save_html(
            vis, str(path / ('Visualizations/BTMVisualization%s.html' % k)))