def pp_inference_tm(state_dwz_, dict_params_, dict_args_):
    try:
        path_tm_ = dict_args_['path_tm']
    except KeyError:
        print('Specify a path to topicmapping; the default might not work.')
    # # parameters
    D_ = dict_params_['D']
    V_ = dict_params_['V']
    K_ = dict_params_['K']
    # # convert the dwz-state into a corpus
    n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz_, D_, V_, K_)
    texts_ = nwd_to_texts(n_wd_)
    state_dwz_infer_, n_wj_infer_, n_jd_infer_, K_infer_ = tm_inference(
        path_tm_, texts_)
    # # get the nmi between the true and inferred token labelings
    nmi = state_dwz_nmi(state_dwz_, state_dwz_infer_, K_, K_infer_)
    # # normalize the inferred topic-document counts and predict one topic per document
    p_dt_infer = np.transpose(n_jd_infer_ / float(np.sum(n_jd_infer_)))
    list_t_d_infer = predict_topic_p_td_unsup(p_dt_infer)
    return nmi, K_infer_, p_dt_infer, list_t_d_infer

def make_dict_corpus_for_inference(dict_output_corpus):
    '''
    Take the output of topicmodel_synthetic_front (which generates the
    synthetic corpus) and put it in the form expected by
    topicmodel_inference_front (which runs inference on the synthetic corpus).
    IN:
    - dict, contains 'state_dwz', 'p_wt', 'p_td'
    OUT:
    - dict, contains 'texts_list_shuffle', 'state_dwz_shuffle'
    '''
    state_dwz = dict_output_corpus['state_dwz']
    p_wt = dict_output_corpus['p_wt']
    p_td = dict_output_corpus['p_td']
    V, K = np.shape(p_wt)
    K, D = np.shape(p_td)
    # ## convert the dwz-state into a corpus
    n_wd, n_wj, n_jd = state_nwjd(state_dwz, D, V, K)
    texts = nwd_to_texts(n_wd.astype('int'))
    dict_corpus_tmp = {
        'texts_list_shuffle': texts,
        'state_dwz_shuffle': state_dwz
    }
    return dict_corpus_tmp

def synthetic_single_stopword_terminal(V=1000, K=5, D=100, m=100, dist_w='uni',
                                       dist_t=None, dist_stop='uni', p_s=0,
                                       c_w=1, c_t=None, seed=None,
                                       burstiness=None, if_info=0):
    '''
    Output:
        p_w_td: p(w|t,d). In general p(w|t,d) = p(w|t) for every d;
        in the burstiness case, however, p(w|t,d) differs between documents.
    '''
    if dist_t is None:
        dist_t = dist_w
    if c_t is None:
        c_t = c_w

    # Get the global word distribution
    p_w = get_pw_pt(V, dist=dist_w)

    # Get the stopword distribution
    stop_distrib = get_pw_pt(V, dist=dist_stop)

    # Choose the stopword list
    num_stopword = int(V * p_s)
    np.random.seed(seed=seed)
    stopword_list = np.random.choice(V, size=num_stopword, replace=False, p=stop_distrib)

    # Get the number of word types in each topic.
    # V_t is the topic size for each topic, i.e. the number of useful
    # (non-stop) word types assigned to that topic.
    num_nonstop = V - num_stopword
    V_t = get_vt_from_nonstop(K, num_nonstop, dist_t)

    # Get the topic assignment for each word type (stopwords and non-stopwords).
    # Stopwords are assigned a very large number as their topic id.
    word_topic_assign_list = get_word_topic_assign_list(V_t, stopword_list, seed=seed)

    # Get the topic distribution p_t
    p_t = get_topic_distribution_p_t(K, p_w, word_topic_assign_list)

    # Get the word-topic distribution p_wt
    p_wt = get_word_topic_distribution_p_wt(K, p_w, p_t, word_topic_assign_list, c_w)

    # Get the topic assignment for each document
    document_topic_assign_list = np.random.choice(K, size=D, replace=True, p=p_t)

    # Get the topic-document distribution p_td
    p_td = get_topic_doc_distribution_ptd(K, p_t, c_t, document_topic_assign_list)

    # Draw the synthetic corpus
    state_dwz, p_w_td = draw_dwz_from_ptd_pwt(p_td, p_wt, m, burstiness=burstiness)
    n_wd, n_wj, n_jd = state_nwjd(state_dwz, D, V, K)
    texts = nwd_to_texts(n_wd)

    # Assemble the output dictionary
    dict_out = {}
    dict_out['p_w'] = p_w
    dict_out['V_t'] = V_t
    dict_out['word_topic_assign_list'] = word_topic_assign_list
    dict_out['p_t'] = p_t
    dict_out['p_wt'] = p_wt
    dict_out['p_w_td'] = p_w_td
    dict_out['document_topic_assign_list'] = document_topic_assign_list
    dict_out['p_td'] = p_td
    dict_out['state_dwz'] = state_dwz
    dict_out['n_wd'] = n_wd
    dict_out['n_wj'] = n_wj
    dict_out['n_jd'] = n_jd
    dict_out['texts'] = texts

    # Optionally compute the structure measure
    if if_info:
        DeltaI, I_alpha = deltaI_from_nwd(n_wd)
        dict_out['DeltaI'] = DeltaI
        dict_out['I_alpha'] = I_alpha

    return dict_out
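
# Minimal usage sketch for the generator above together with
# make_dict_corpus_for_inference (illustration only; the parameter values
# are arbitrary small examples, not defaults used anywhere in the code base).
def _example_synthetic_single_stopword():
    # Small synthetic corpus: 200 word types, 4 topics, 50 documents of
    # 100 tokens each, with 10% of word types designated as stopwords.
    dict_corpus = synthetic_single_stopword_terminal(V=200, K=4, D=50, m=100,
                                                     p_s=0.1, seed=42)
    print(np.shape(dict_corpus['p_wt']))  # (V, K): word-topic distribution
    print(np.shape(dict_corpus['p_td']))  # (K, D): topic-document distribution
    # Repackage the output so it can be fed to the inference front-end.
    dict_for_inference = make_dict_corpus_for_inference(dict_corpus)
    return dict_for_inference['texts_list_shuffle']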

def synthetic_dirichlet_terminal(V, K, dist_w, D, m, alpha, beta=None,
                                 dist_t=None, seed=None, burstiness=None,
                                 if_info=True):
    if dist_t is None:
        dist_t = dist_w
    if beta is None:
        beta = 1.0 * alpha

    # # global distribution of topic sizes
    p_t = get_pw_pt(K, dist=dist_t)
    # # global distribution of word frequencies
    p_w = get_pw_pt(V, dist=dist_w)

    # # get vector-valued hyperparameters
    vec_alpha = make_hyper_vec(alpha, p_t)
    vec_beta = make_hyper_vec(beta, p_w)

    # # create the mixture matrices p_wt (word-topic) and p_td (topic-doc)
    p_td, p_wt = make_pwt_ptd_dirichlet(vec_alpha, vec_beta, D, seed=seed)

    # # draw the dwz-state
    state_dwz, p_w_td = draw_dwz_from_ptd_pwt(p_td, p_wt, m, burstiness=burstiness)
    n_wd, n_wj, n_jd = state_nwjd(state_dwz, D, V, K)
    texts = nwd_to_texts(n_wd)

    # # infer the topic membership of each doc:
    # # choose the topic with the largest contribution to p(t|d)
    list_t_d_true = np.argmax(p_td, axis=0)

    # # empirical p_t and p_w; otherwise p_tw is not normalized
    p_t_emp = 1.0 / D * np.sum(p_td, axis=1)
    p_w_emp = np.sum(p_wt * p_t_emp, axis=1)  # p(w) = sum_t p(w|t) p(t), length V

    # # infer the topic membership of each word:
    # # choose the topic with the largest contribution to p(t|w) = p(w|t)*p(t)/p(w)
    p_tw = (p_wt.T * (p_t_emp[:, np.newaxis] / p_w_emp))
    list_t_w_true = np.argmax(p_tw, axis=0)

    # # Optionally compute the structure measure
    if if_info:
        DeltaI, I_alpha = deltaI_from_nwd(n_wd)
    else:
        DeltaI = 0
        I_alpha = 0

    dict_out = {}
    dict_out['p_w'] = p_w
    dict_out['word_topic_assign_list'] = list_t_w_true
    dict_out['p_t'] = p_t
    dict_out['p_wt'] = p_wt
    dict_out['document_topic_assign_list'] = list_t_d_true
    dict_out['p_td'] = p_td
    dict_out['state_dwz'] = state_dwz
    dict_out['n_wd'] = n_wd
    dict_out['n_wj'] = n_wj
    dict_out['n_jd'] = n_jd
    dict_out['texts'] = texts
    dict_out['p_tw'] = p_tw
    dict_out['DeltaI'] = DeltaI
    dict_out['I_alpha'] = I_alpha
    return dict_out
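
# make_pwt_ptd_dirichlet is defined elsewhere in the code base. The sketch
# below only illustrates one standard way such mixture matrices can be drawn,
# assuming vec_alpha / vec_beta are concentration parameters spread over the
# global distributions (as the make_hyper_vec calls above suggest); it is not
# the module's own implementation.
def _sketch_make_pwt_ptd_dirichlet(vec_alpha, vec_beta, D, seed=None):
    rng = np.random.RandomState(seed)
    K = len(vec_alpha)
    # one topic mixture p(t|d) per document, one word distribution p(w|t) per topic
    p_td = rng.dirichlet(vec_alpha, size=D).T  # shape (K, D), columns sum to 1
    p_wt = rng.dirichlet(vec_beta, size=K).T   # shape (V, K), columns sum to 1
    return p_td, p_wt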

def tm_inference_terminal(texts, state_dwz_true=None, k_true=None,
                          flag_coherence=0,
                          path_tm=os.path.abspath(
                              os.path.join(os.pardir, 'src/external/topicmapping'))):
    '''
    Do the inference for p_dt and, optionally, state_dwz.
    Input:
        ## provide the corpus
        'texts': texts
        ## optional, only works for a synthetic corpus with token labeling
        'state_dwz_true': state_dwz
        'k_true': K
        ## optional
        'path_tm': os.path.abspath(os.path.join(os.pardir, 'src/external/topicmapping'))
    Output:
        dict_output = {
            'p_td_infer': p_td inferred by the topic model
            'token_labeling_nmi': optional, token-labeling results, only for synthetic data
            'k_infer': inferred number of topics
        }
    '''
    # # Generate an empty dict for the output
    dict_output = {}

    # # inference for p_dt: write the corpus to a temporary directory and run topicmapping
    train_dir = make_path_tmp_tm()
    train_fname = texts_corpus_tm(texts, train_dir)
    dir_cwd = os.getcwd()
    os.chdir(path_tm)
    cmd_tm = './bin/topicmap -f %s -o %stest_result' % (train_fname, train_dir)
    p = subprocess.Popen(cmd_tm, shell=True,
                         stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    p.wait()
    os.chdir(dir_cwd)

    # # get p_td_tm
    p_td_tm = tm_inference_get_p_td_tm(train_dir)
    dict_output['p_td_infer'] = p_td_tm

    # # get p_wt_tm
    p_wt_tm = tm_inference_get_p_wt_tm(train_dir)
    dict_output['p_wt_infer'] = p_wt_tm

    # # get the number of topics
    with open(train_dir + 'test_result/lda_class_words.txt', 'r') as f:
        x = f.readlines()
    k_tm = len(x)
    dict_output['k_infer'] = k_tm

    if flag_coherence == 1:
        state_dwz_tm = tm_inference_get_state_dwz_tm(train_dir)
        dict_gs = corpora.Dictionary(texts)
        all_terms = list(dict_gs.keys())
        V = len(all_terms)
        D = len(texts)
        n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz_tm, D, V, k_tm)
        dict_output['coherence'] = topic_cherence_C(n_wd_, n_wj_)

    # # get the nmi for token labeling
    state_dwz_tm = tm_inference_get_state_dwz_tm(train_dir)
    dict_output['state_dwz_infer'] = state_dwz_tm
    if state_dwz_true is not None:
        dict_output_token_labeling = get_dict_output_token_labeling(
            state_dwz_true, state_dwz_tm, k_true, k_tm)
        dict_output.update(dict_output_token_labeling)

    os.system('rm -rf %s' % (train_dir))
    return dict_output
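
# The token-labeling scores above come from get_dict_output_token_labeling /
# state_dwz_nmi, which are defined elsewhere. The sketch below only
# illustrates the underlying idea with scikit-learn, under the simplifying
# assumption that both dwz-states enumerate the same (d, w) tokens; it is
# not the module's own implementation.
def _sketch_token_labeling_nmi(state_dwz_true, state_dwz_infer):
    from sklearn.metrics import normalized_mutual_info_score
    # Align tokens by sorting on (d, w, z) and compare the topic labels z.
    z_true = [dwz[2] for dwz in sorted(state_dwz_true)]
    z_infer = [dwz[2] for dwz in sorted(state_dwz_infer)]
    return normalized_mutual_info_score(z_true, z_infer)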

def ldags_inference_terminal(texts, input_k, state_dwz_true=None, k_true=None,
                             input_v=None, path_mallet=None, dN_opt=0,
                             iterations=1000, alpha=50.0, beta=0.01,
                             flag_coherence=0):
    '''
    Do the inference for p_dt and, optionally, state_dwz.
    Input:
        ## provide the corpus and the number of topics
        'texts': texts
        'input_k': K
        ## optional, only works for a synthetic corpus with token labeling
        'state_dwz_true': state_dwz
        'k_true': K
        ## optional
        'input_v': V  # only needed for the synthetic corpus with token labeling
        'path_mallet': os.path.abspath(os.path.join(os.pardir, 'src/external/mallet-2.0.8RC3/bin/mallet'))
        'dN_opt': 0
        'iterations': 1000
    Output:
        dict_output = {
            'p_td_infer': p_td inferred by the topic model
            'token_labeling_nmi': optional, token-labeling results, only for synthetic data
        }
    '''
    # # Generate an empty dict for the output
    dict_output = {}

    # # inference for p_dt
    if input_v is not None:
        # # for the synthetic corpus with token labeling
        dict_gs = gs.corpora.Dictionary([[str(i)] for i in range(input_v)])
    else:
        # # for a real-world corpus or a synthetic corpus without token labeling
        dict_gs = corpora.Dictionary(texts)
    corpus_gs = [dict_gs.doc2bow(text) for text in texts]
    D = len(texts)
    path_tmp = make_path_tmp_lda()
    model = LdaMallet(path_mallet, corpus_gs, num_topics=input_k, id2word=dict_gs,
                      prefix=path_tmp, iterations=iterations,
                      optimize_interval=dN_opt, workers=1, alpha=alpha, beta=beta)

    # <<< infer p(t|d) from Mallet's doc-topics file
    fdoctopics_path = model.fdoctopics()
    with open(fdoctopics_path, "r") as text_file:
        lines = text_file.readlines()
    p_d_t_ldamallet = np.zeros([D, input_k])
    for d_num in range(D):
        t_d_oneline_str = lines[d_num]
        t_d_oneline_list = t_d_oneline_str.strip('\n').split('\t')[2:]
        for t_num in range(input_k):
            p_d_t_ldamallet[d_num, t_num] = t_d_oneline_list[t_num]
    dict_output['p_td_infer'] = p_d_t_ldamallet
    # >>>

    # <<< get the nmi for token labeling
    fname_labels = path_tmp + 'state.mallet.gz'
    state_dwz_infer, alpha_, beta_ = state_read_mallet(fname_labels)
    if state_dwz_true is not None:
        dict_output_token_labeling = get_dict_output_token_labeling(
            state_dwz_true, state_dwz_infer, k_true, input_k)
        dict_output.update(dict_output_token_labeling)
    # In general we do not need to output state_dwz_infer
    dict_output['state_dwz_infer'] = state_dwz_infer
    # >>>

    if flag_coherence == 1:
        all_terms = list(dict_gs.keys())
        V = len(all_terms)
        n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz_infer, D, V, input_k)
        dict_output['coherence'] = topic_cherence_C(n_wd_, n_wj_)

    # <<< infer p(w|t) from beta-smoothed word-topic counts
    D = max(np.array(state_dwz_infer)[:, 0]) + 1
    V = max(np.array(state_dwz_infer)[:, 1]) + 1
    K = input_k
    n_wd_infer, n_wj_infer, n_jd_infer = state_nwjd(state_dwz_infer, D, V, K)
    beta_array = np.ones([V, 1]) * beta
    n_wj_beta_array = n_wj_infer + beta_array
    n_wj_beta_array_vector = np.sum(n_wj_infer + beta_array, axis=0)
    p_wt_infer = n_wj_beta_array / n_wj_beta_array_vector
    dict_output['p_wt_infer'] = p_wt_infer
    # >>>

    os.system('rm -rf %s' % (path_tmp))
    return dict_output
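
# Usage sketch for the Mallet wrapper above (illustration only). The Mallet
# path mirrors the default suggested in the docstring and must point to an
# actual installation; the parameter values are arbitrary small examples.
def _example_ldags_inference():
    dict_corpus = synthetic_single_stopword_terminal(V=200, K=4, D=50, m=100, seed=1)
    path_mallet = os.path.abspath(
        os.path.join(os.pardir, 'src/external/mallet-2.0.8RC3/bin/mallet'))
    dict_out = ldags_inference_terminal(dict_corpus['texts'], input_k=4,
                                        state_dwz_true=dict_corpus['state_dwz'],
                                        k_true=4, input_v=200,
                                        path_mallet=path_mallet, iterations=500)
    print(dict_out['p_td_infer'].shape)  # (D, input_k) inferred doc-topic matrix
    return dict_out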

def hdp_inference_terminal(texts, state_dwz_true=None, k_true=None,
                           flag_coherence=0,
                           path_hdp=os.path.abspath(
                               os.path.join(os.pardir, 'src/external/hdp-bleilab/hdp-faster'))):
    '''
    Do the inference for p_dt and, optionally, state_dwz.
    Input:
        ## provide the corpus
        'texts': texts
        ## optional, only works for a synthetic corpus with token labeling
        'state_dwz_true': state_dwz
        'k_true': K
        ## optional
        'path_hdp': os.path.abspath(os.path.join(os.pardir, 'src/external/hdp-bleilab/hdp-faster'))
    Output:
        dict_output = {
            'p_td_infer': p_td inferred by the topic model
            'token_labeling_nmi': optional, token-labeling results, only for synthetic data
            'k_infer': number of topics inferred by the topic model
        }
    '''
    # # Generate an empty dict for the output
    dict_output = {}

    # # inference for p_dt: write the corpus to a temporary directory and run hdp
    train_dir = make_path_tmp_hdp()
    train_fname = texts_corpus_hdp(texts, train_dir)
    dir_cwd = os.getcwd()
    os.chdir(path_hdp)
    cmd_hdp = './hdp --train_data %s --directory %s' % (train_fname, train_dir)
    p = subprocess.Popen(cmd_hdp, shell=True,
                         stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    p.wait()
    os.chdir(dir_cwd)

    # # doc-topic counts, normalized per document into p(t|d)
    with open(train_dir + 'final.doc.states', 'r') as f:
        x = f.readlines()
    D_ = len(x)
    K_hdp = len(x[0].split())
    p_td_hdp = np.zeros((D_, K_hdp))
    for i_d, d in enumerate(x):
        n_j_tmp = np.array([int(h_) for h_ in d.split()])
        p_td_hdp[i_d, :] = n_j_tmp / float(np.sum(n_j_tmp))
    dict_output['p_td_infer'] = p_td_hdp

    # # get the number of topics
    with open(train_dir + 'final.topics', 'r') as f:
        x = f.readlines()
    k_hdp = len(x)
    dict_output['k_infer'] = k_hdp

    # # individual token labels
    with open(train_dir + 'final.word-assignments', 'r') as f:
        header = f.readline()  # skip the header line
        x = f.readlines()
    state_dwz_hdp = [tuple([int(h_) for h_ in h.split()]) for h in x]
    dict_output['state_dwz_infer'] = state_dwz_hdp

    if flag_coherence == 1:
        dict_gs = corpora.Dictionary(texts)
        all_terms = list(dict_gs.keys())
        V = len(all_terms)
        D = len(texts)
        n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz_hdp, D, V, k_hdp)
        dict_output['coherence'] = topic_cherence_C(n_wd_, n_wj_)

    # # infer p(w|t) by normalizing word-topic counts from the inferred state
    all_word_list = [i[1] for i in state_dwz_hdp]
    n_w = max(all_word_list) + 1
    num_k = k_hdp
    p_wt_infer = np.zeros([n_w, num_k])
    for i in state_dwz_hdp:
        tmp_w = i[1]
        tmp_t = i[2]
        p_wt_infer[tmp_w, tmp_t] += 1
    p_wt_infer = p_wt_infer / p_wt_infer.sum(axis=0)
    dict_output['p_wt_infer'] = p_wt_infer

    # # get the nmi for token labeling
    if state_dwz_true is not None:
        dict_output_token_labeling = get_dict_output_token_labeling(
            state_dwz_true, state_dwz_hdp, k_true, k_hdp)
        dict_output.update(dict_output_token_labeling)

    os.system('rm -rf %s' % (train_dir))
    return dict_output
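
# Usage sketch for the HDP wrapper above (illustration only; parameter values
# are arbitrary). Unlike the Mallet wrapper, HDP infers the number of topics
# itself and returns it as 'k_infer'; the default path_hdp from the signature
# must point to a compiled hdp binary.
def _example_hdp_inference():
    dict_corpus = synthetic_single_stopword_terminal(V=200, K=4, D=50, m=100, seed=1)
    dict_out = hdp_inference_terminal(dict_corpus['texts'],
                                      state_dwz_true=dict_corpus['state_dwz'],
                                      k_true=4)
    print(dict_out['k_infer'])           # number of topics found by HDP
    print(dict_out['p_td_infer'].shape)  # (D, k_infer) inferred doc-topic matrix
    return dict_out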

def topic_coherence_from_state_dwz(state_dwz, input_k, n=10, eps=1.0):
    '''Compute the mean topic coherence of a dwz-state with input_k topics.'''
    D = len(set([dwz[0] for dwz in state_dwz]))  # number of documents
    V = len(set([dwz[1] for dwz in state_dwz]))  # number of word types
    n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz, D, V, input_k)
    C = topic_cherence_C(n_wd_, n_wj_, n=n, eps=eps)
    return np.mean(C)
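
# topic_cherence_C is defined elsewhere in the code base. The sketch below
# illustrates one common measure it could correspond to (UMass coherence);
# the exact definition used in the module may differ.
def _sketch_umass_coherence(n_wd, n_wj, n=10, eps=1.0):
    # For each topic: take its n most frequent words and sum
    # log((D(w_i, w_j) + eps) / D(w_j)) over word pairs, where D(.) counts
    # the documents containing the word(s).
    doc_has_w = n_wd > 0                         # (V, D) word-document incidence
    V, K = np.shape(n_wj)
    coherences = []
    for k in range(K):
        top = np.argsort(n_wj[:, k])[::-1][:n]   # n most frequent words in topic k
        c = 0.0
        for i in range(1, len(top)):
            for j in range(i):
                co_docs = np.sum(doc_has_w[top[i]] & doc_has_w[top[j]])
                docs_j = max(np.sum(doc_has_w[top[j]]), 1)  # guard against unused words
                c += np.log((co_docs + eps) / docs_j)
        coherences.append(c)
    return np.array(coherences)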