Ejemplo n.º 1
0
def test_get_marginal_word_distrib(dtm, n_topics):
    """
    Check that the marginal word distribution derived from a fitted LDA model has one entry per
    DTM column, sums to one and only contains values in [0, 1].
    """
    dtm = np.array(dtm)
    if not dtm.sum():
        # an all-zero DTM gets a single count so that it contains at least one word
        dtm[0, 0] = 1

    fitted = lda.LDA(n_topics, 1)
    fitted.fit(dtm)

    # marginal topic distribution, weighted by the document lengths
    lengths = tmtoolkit.bow.bow_stats.get_doc_lengths(dtm)
    topic_marginals = model_stats.get_marginal_topic_distrib(fitted.doc_topic_, lengths)

    word_marginals = model_stats.get_marginal_word_distrib(fitted.topic_word_, topic_marginals)

    n_vocab = dtm.shape[1]
    assert word_marginals.shape == (n_vocab, )
    assert np.isclose(word_marginals.sum(), 1.0)
    for prob in word_marginals:
        assert 0 <= prob <= 1
Ejemplo n.º 2
0
def test_get_word_distinctiveness(dtm, n_topics):
    """
    Check that word distinctiveness computed from a fitted LDA model yields one non-negative
    value per DTM column.
    """
    dtm = np.array(dtm)
    dtm_is_empty = dtm.sum() == 0
    if dtm_is_empty:
        # make sure the DTM contains at least one word
        dtm[0, 0] = 1

    lda_model = lda.LDA(n_topics, 1)
    lda_model.fit(dtm)

    lengths = tmtoolkit.bow.bow_stats.get_doc_lengths(dtm)
    topic_marginals = model_stats.get_marginal_topic_distrib(lda_model.doc_topic_, lengths)

    distinctiveness = model_stats.get_word_distinctiveness(lda_model.topic_word_, topic_marginals)

    expected_shape = (dtm.shape[1], )
    assert distinctiveness.shape == expected_shape
    for value in distinctiveness:
        assert value >= 0
Ejemplo n.º 3
0
def save_ldamodel_summary_to_excel(excel_file, topic_word_distrib, doc_topic_distrib, doc_labels, vocab,
                                   top_n_topics=10, top_n_words=10, dtm=None,
                                   rank_label_fmt=None, topic_label_fmt=None):
    """
    Write a summary of an LDA model to an Excel file, one sheet per summary table.

    Six sheets are always written: the top `top_n_topics` topics per document and the top
    `top_n_words` words per topic, each as values, as labels and as labelled values. If `dtm`
    is given, a seventh sheet with the marginal topic distribution is added.

    :param excel_file: target that is handed to `pd.ExcelWriter`
    :param topic_word_distrib: topic-word distribution of the model
    :param doc_topic_distrib: document-topic distribution of the model
    :param doc_labels: labels used as row labels of the document-topic sheets
    :param vocab: vocabulary used as value labels of the topic-word sheets
    :param top_n_topics: number of top topics per document
    :param top_n_words: number of top words per topic
    :param dtm: optional document-term matrix; only used to derive the document lengths for the
                marginal topic distribution sheet
    :param rank_label_fmt: format string for rank labels; falls back to `DEFAULT_RANK_NAME_FMT`
    :param topic_label_fmt: format string for topic labels; falls back to `DEFAULT_TOPIC_NAME_FMT`
    :return: `OrderedDict` that maps each sheet name to the data written to that sheet
    """
    rank_label_fmt = rank_label_fmt or DEFAULT_RANK_NAME_FMT
    topic_label_fmt = topic_label_fmt or DEFAULT_TOPIC_NAME_FMT
    sheets = OrderedDict()

    # doc-topic distribution sheets
    sheets['top_doc_topics_vals'] = top_n_from_distribution(doc_topic_distrib, top_n=top_n_topics,
                                                            row_labels=doc_labels,
                                                            col_labels=rank_label_fmt)
    sheets['top_doc_topics_labels'] = top_n_from_distribution(doc_topic_distrib, top_n=top_n_topics,
                                                              row_labels=doc_labels,
                                                              col_labels=rank_label_fmt,
                                                              val_labels=topic_label_fmt)
    sheets['top_doc_topics_labelled_vals'] = ldamodel_top_doc_topics(doc_topic_distrib, doc_labels, top_n=top_n_topics)

    # topic-word distribution sheets
    sheets['top_topic_word_vals'] = top_n_from_distribution(topic_word_distrib, top_n=top_n_words,
                                                            row_labels=topic_label_fmt,
                                                            col_labels=rank_label_fmt)
    sheets['top_topic_word_labels'] = top_n_from_distribution(topic_word_distrib, top_n=top_n_words,
                                                              row_labels=topic_label_fmt,
                                                              col_labels=rank_label_fmt,
                                                              val_labels=vocab)
    sheets['top_topic_words_labelled_vals'] = ldamodel_top_topic_words(topic_word_distrib, vocab, top_n=top_n_words)

    if dtm is not None:
        doc_lengths = get_doc_lengths(dtm)
        marg_topic_distr = get_marginal_topic_distrib(doc_topic_distrib, doc_lengths)
        # label the rows with the caller's topic label format so that this sheet agrees with
        # the topic labels used in all other sheets
        row_names = [topic_label_fmt.format(i0=i, i1=i + 1) for i in range(len(marg_topic_distr))]
        sheets['marginal_topic_distrib'] = pd.DataFrame(marg_topic_distr, columns=['marginal_topic_distrib'],
                                                        index=row_names)

    # the writer is only opened once all sheets are built; the context manager saves and closes
    # the file even when writing a sheet fails (`ExcelWriter.save()` no longer exists in
    # current pandas versions)
    with pd.ExcelWriter(excel_file) as excel_writer:
        for sh_name, sh_data in sheets.items():
            sh_data.to_excel(excel_writer, sheet_name=sh_name)

    return sheets
Ejemplo n.º 4
0
    def __init__(self, model, corpus, gensim_dict, n_topics):
        """
        Collect the model, corpus and dictionary and precompute the matrices derived from them.

        Sets the attributes `vocab`, `dense` (document-term count matrix with one row per
        document), `doc_lengths`, `doc_topic`, `p_t` (marginal topic distribution) and
        `topic_word`, and prints a short summary.

        model       -- fitted topic model providing `get_document_topics()` and `get_topics()`
        corpus      -- bag-of-words corpus; must support `len()` and iteration
        gensim_dict -- dictionary providing `token2id` and `len()`
        n_topics    -- number of topics of `model`
        """
        self.model = model
        self.corpus = corpus
        self.gensim_dict = gensim_dict
        self.n_topics = n_topics

        self.vocab_size = len(gensim_dict)
        self.n_docs = len(corpus)

        # Order the tokens by their id so that vocab[i] is the token with id i; the key order of
        # the token2id mapping itself is not guaranteed to follow the ids.
        token2id = gensim_dict.token2id
        self.vocab = np.asarray(sorted(token2id, key=token2id.get))

        # corpus2dense yields a terms x documents matrix; transpose to documents x terms
        self.dense = gensim.matutils.corpus2dense(corpus,
                                                  num_terms=self.vocab_size,
                                                  num_docs=self.n_docs).astype(int).T
        self.doc_lengths = get_doc_lengths(self.dense)

        # The topics of a document come back as sparse (topic_id, probability) pairs that can
        # omit topics, so each probability is placed by its topic id and missing topics stay 0.
        doc_topic = np.zeros([self.n_docs, n_topics])
        for doc_idx, bow in enumerate(corpus):
            topic_probs = self.model.get_document_topics(bow,
                                                         minimum_probability=0.0,
                                                         per_word_topics=False)
            for topic_id, prob in topic_probs:
                doc_topic[doc_idx, topic_id] = prob

        self.doc_topic = doc_topic
        self.p_t = get_marginal_topic_distrib(doc_topic, self.doc_lengths)

        self.topic_word = self.model.get_topics()

        print('\n- {} topics learnt from {} documents with vocabulary size of {} '
              'unique words.\n\n'.format(n_topics, self.n_docs, self.vocab_size))

        print('Marginal topic distribution Pr(Topic):\n{}'.format(self.p_t))
Ejemplo n.º 5
0
# attach the session dates via a left join on the session ID; no document may end up without a date
doc_meta = pd.merge(doc_meta, sess_dates, on='sess_id', how='left')
assert not doc_meta.date.isna().any()

#%% calculate marginal topic distribution per party

# the document lengths act as weights in the marginal topic distribution,
# i.e. the topics of longer speeches count more
doc_lengths = get_doc_lengths(dtm)

stats_per_party = {}
for party, grp in doc_meta.groupby('party'):
    # positions of this party's documents within `doc_labels`
    party_speeches_ind = np.where(np.isin(doc_labels, grp.doc_label))[0]

    party_marginal_topic = get_marginal_topic_distrib(theta[party_speeches_ind, :],
                                                      doc_lengths[party_speeches_ind])

    # keep the group size next to the distribution
    stats_per_party[party] = (party_marginal_topic, len(grp))


#%% plot marginal topic proportion per party

# Figure setup: a fixed 3x2 grid of subplots that share the x axis.
n_parties = len(stats_per_party)
n_top_topics = 5
n_top_words = 8
# NOTE(review): the grid size is hard-coded to 3x2 (six axes) while `n_parties` is computed from
# the data -- presumably one subplot per party; confirm there are never more than six parties.
fig, axes = plt.subplots(3, 2, sharex=True, figsize=(12, 8), constrained_layout=True)
# turn the 3x2 grid of axes into a flat sequence
axes = axes.flatten()

fig.suptitle(u'Top %d marginal topic proportions per party' % n_top_topics, fontsize='medium')
# NOTE(review): the figure is created with `constrained_layout=True` above; matplotlib documents
# constrained layout as incompatible with `subplots_adjust` -- confirm which of the two is intended.
fig.subplots_adjust(top=0.925)