import numpy as np


def pp_inference_tm(state_dwz_, dict_params_, dict_args_):
    '''
    Infer topics for the corpus encoded by state_dwz_ and return the NMI
    between the true and inferred labelings, plus the inferred quantities.
    '''
    try:
        path_tm_ = dict_args_['path_tm']
    except KeyError:
        raise KeyError("dict_args_ must contain 'path_tm' (path to the "
                       "topic model); there is no usable default.")
    # # parameters
    D_ = dict_params_['D']
    V_ = dict_params_['V']
    K_ = dict_params_['K']

    # # convert state into corpus
    n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz_, D_, V_, K_)
    texts_ = nwd_to_texts(n_wd_)

    state_dwz_infer_, n_wj_infer_, n_jd_infer_, K_infer_ = tm_inference(
        path_tm_, texts_)
    # # NMI between the true and inferred labelings
    nmi = state_dwz_nmi(state_dwz_, state_dwz_infer_, K_, K_infer_)

    # # topic-document weights from the inferred counts
    p_dt_infer = np.transpose(n_jd_infer_ / float(np.sum(n_jd_infer_)))

    list_t_d_infer = predict_topic_p_td_unsup(p_dt_infer)

    return nmi, K_infer_, p_dt_infer, list_t_d_infer
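

# A minimal usage sketch (hypothetical values: 'path/to/tm' is a placeholder,
# and 'D'/'V'/'K' must match the state_dwz actually passed in; state_dwz can
# come from one of the synthetic generators below):
def _example_pp_inference_tm(state_dwz):
    dict_params = {'D': 100, 'V': 1000, 'K': 5}
    dict_args = {'path_tm': 'path/to/tm'}
    nmi, K_infer, p_dt_infer, list_t_d_infer = pp_inference_tm(
        state_dwz, dict_params, dict_args)
    print('NMI between true and inferred labels:', nmi)
    return nmi
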
def make_dict_corpus_for_inference(dict_output_corpus):
    '''
    Take the output of topicmodel_synthetic_front [generating the synthetic corpus]
    and put it in the form expected by topicmodel_inference_front [inferring the synthetic corpus].
    IN:
    - dict, contains 'state_dwz', 'p_wt', 'p_td'
    OUT:
    - dict, contains 'texts_list_shuffle', 'state_dwz_shuffle'
    '''
    state_dwz = dict_output_corpus['state_dwz']
    p_wt = dict_output_corpus['p_wt']
    p_td = dict_output_corpus['p_td']
    V, K = np.shape(p_wt)
    K, D = np.shape(p_td)
    # ## convert state into corpus
    n_wd, n_wj, n_jd = state_nwjd(state_dwz, D, V, K)
    texts = nwd_to_texts(n_wd.astype('int'))
    dict_corpus_tmp = {
        'texts_list_shuffle': texts,
        'state_dwz_shuffle': state_dwz
    }
    return dict_corpus_tmp
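

# A minimal sketch chaining a synthetic corpus into the inference-ready dict
# (assumes synthetic_single_stopword_terminal below, whose dict_out carries
# the required 'state_dwz', 'p_wt', and 'p_td' keys):
def _example_make_dict_corpus():
    dict_output_corpus = synthetic_single_stopword_terminal(V=1000, K=5,
                                                            D=100, m=100)
    dict_corpus = make_dict_corpus_for_inference(dict_output_corpus)
    return dict_corpus['texts_list_shuffle'], dict_corpus['state_dwz_shuffle']
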
def synthetic_single_stopword_terminal(V=1000,
                                       K=5,
                                       D=100,
                                       m=100,
                                       dist_w='uni',
                                       dist_t=None,
                                       dist_stop='uni',
                                       p_s=0,
                                       c_w=1,
                                       c_t=None,
                                       seed=None,
                                       burstiness=None,
                                       if_info=0):
    '''
    Generate a synthetic corpus with an explicit stopword list.
    Output (in dict_out):
    p_w_td: p(w|t,d). In general p(w|t,d) = p(w|t) for every d; in the
    burstiness case, however, p(w|t,d) differs from document to document.
    '''

    if dist_t is None:
        dist_t = dist_w

    if c_t is None:
        c_t = c_w

    # Get global word distribution
    # p_w = get_global_word_distribution_pw(V, marker_pw )
    p_w = get_pw_pt(V, dist=dist_w)

    # Get stopword distribution
    # stop_distrib = get_global_word_distribution_pw(V , marker_stop )
    stop_distrib = get_pw_pt(V, dist=dist_stop)

    # Choose stopword list
    num_stopword = int(V * p_s)
    np.random.seed(seed=seed)
    stopword_list = np.random.choice(V,
                                     size=num_stopword,
                                     replace=False,
                                     p=stop_distrib)

    # Get the number of word types in each topic
    num_nonstop = V - num_stopword
    V_t = get_vt_from_nonstop(
        K, num_nonstop, dist_t
    )  # V_t is the topic size for each topic, i.e., the number of useful non-stopwords in each topic

    # Get the topic assignment for each word: both stopwords and non-stopwords.
    # Stopwords are assigned a very large number as their topic id.
    word_topic_assign_list = get_word_topic_assign_list(V_t,
                                                        stopword_list,
                                                        seed=seed)

    # Get topic distribution p_t
    p_t = get_topic_distribution_p_t(K, p_w, word_topic_assign_list)

    # Get word-topic distribution
    p_wt = get_word_topic_distribution_p_wt(K, p_w, p_t,
                                            word_topic_assign_list, c_w)

    # Get the topic assignment for each document
    document_topic_assign_list = np.random.choice(K,
                                                  size=D,
                                                  replace=True,
                                                  p=p_t)

    # Get topic-document distribution
    p_td = get_topic_doc_distribution_ptd(K, p_t, c_t,
                                          document_topic_assign_list)

    # Get the synthetic corpus
    state_dwz, p_w_td = draw_dwz_from_ptd_pwt(p_td,
                                              p_wt,
                                              m,
                                              burstiness=burstiness)
    n_wd, n_wj, n_jd = state_nwjd(state_dwz, D, V, K)
    texts = nwd_to_texts(n_wd)

    # Assemble the output dictionary
    dict_out = {}
    dict_out['p_w'] = p_w
    dict_out['V_t'] = V_t
    dict_out['word_topic_assign_list'] = word_topic_assign_list
    dict_out['p_t'] = p_t
    dict_out['p_wt'] = p_wt
    dict_out['p_w_td'] = p_w_td
    dict_out['document_topic_assign_list'] = document_topic_assign_list
    dict_out['p_td'] = p_td
    dict_out['state_dwz'] = state_dwz
    dict_out['n_wd'] = n_wd
    dict_out['n_wj'] = n_wj
    dict_out['n_jd'] = n_jd
    dict_out['texts'] = texts

    # Optionally compute the information-theoretic structure measures
    if if_info:
        DeltaI, I_alpha = deltaI_from_nwd(n_wd)
        dict_out['DeltaI'] = DeltaI
        dict_out['I_alpha'] = I_alpha

    return dict_out
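

# A minimal usage sketch with illustrative parameter values (p_s=0.1 makes
# 10% of the vocabulary stopwords; seed fixes the random draws):
def _example_single_stopword():
    dict_out = synthetic_single_stopword_terminal(V=1000, K=5, D=100, m=100,
                                                  p_s=0.1, seed=42)
    print('number of texts:', len(dict_out['texts']))
    print('non-stopword types per topic V_t:', dict_out['V_t'])
    return dict_out
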
def synthetic_dirichlet_terminal(V,
                                 K,
                                 dist_w,
                                 D,
                                 m,
                                 alpha,
                                 beta=None,
                                 dist_t=None,
                                 seed=None,
                                 burstiness=None,
                                 if_info=True):
    '''
    Generate a synthetic corpus whose mixture matrices p_wt (word-topic)
    and p_td (topic-doc) are drawn from Dirichlet distributions with
    hyperparameter vectors built from alpha and beta.
    '''

    if dist_t is None:
        dist_t = dist_w
    if beta is None:
        beta = 1.0 * alpha

    # # global distribution of topic-size
    p_t = get_pw_pt(K, dist=dist_t)
    # # global distribution of word frequencies
    p_w = get_pw_pt(V, dist=dist_w)

    # # get vector-hyperparameters
    vec_alpha = make_hyper_vec(alpha, p_t)
    vec_beta = make_hyper_vec(beta, p_w)

    # # create the mixture-matrices p_wt (word-topic) and p_td (topic-doc)
    p_td, p_wt = make_pwt_ptd_dirichlet(vec_alpha, vec_beta, D, seed=seed)

    # # draw the dwz-state
    state_dwz, p_w_td = draw_dwz_from_ptd_pwt(p_td,
                                              p_wt,
                                              m,
                                              burstiness=burstiness)
    n_wd, n_wj, n_jd = state_nwjd(state_dwz, D, V, K)
    texts = nwd_to_texts(n_wd)
    # # infer the topic-membership of each doc:
    # # choose topic with largest contribution from p(t|d)
    list_t_d_true = np.argmax(p_td, axis=0)

    # # empirical p_t and p_w; otherwise p_tw is not normalized
    p_t_emp = 1.0 / D * np.sum(p_td, axis=1)
    p_w_emp = np.sum(p_wt * p_t_emp, axis=1)  # p(w) = sum_t p(w|t)*p(t)

    # # infer the topic-membership of each word:
    # # choose topic with largest contribution from p(t|w) = p(w|t)*p(t)/p(w)
    # p_tw = (p_wt*(p_w[:,np.newaxis]/p_t)).T
    p_tw = (p_wt.T * (p_t_emp[:, np.newaxis] / p_w_emp))
    list_t_w_true = np.argmax(p_tw, axis=0)

    # # Optionally compute the information-theoretic structure measures
    if if_info:
        DeltaI, I_alpha = deltaI_from_nwd(n_wd)
    else:
        DeltaI = 0
        I_alpha = 0

    dict_out = {}
    dict_out['p_w'] = p_w
    # dict_out['V_t'] =V_t
    dict_out['word_topic_assign_list'] = list_t_w_true
    dict_out['p_t'] = p_t
    dict_out['p_wt'] = p_wt
    dict_out['document_topic_assign_list'] = list_t_d_true
    dict_out['p_td'] = p_td
    dict_out['state_dwz'] = state_dwz
    dict_out['n_wd'] = n_wd
    dict_out['n_wj'] = n_wj
    dict_out['n_jd'] = n_jd
    dict_out['texts'] = texts
    dict_out['p_tw'] = p_tw

    dict_out['DeltaI'] = DeltaI
    dict_out['I_alpha'] = I_alpha

    return dict_out
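

# A minimal usage sketch with illustrative hyperparameters (smaller alpha
# concentrates each document on fewer topics; beta defaults to alpha):
def _example_dirichlet():
    dict_out = synthetic_dirichlet_terminal(V=1000, K=5, dist_w='uni',
                                            D=100, m=100, alpha=0.1, seed=42)
    # dominant topic of each document, from argmax over p(t|d)
    print(dict_out['document_topic_assign_list'][:10])
    return dict_out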