def test_empty_uid():
    cps = tp.utils.Corpus()
    cps.add_doc("test text".split())
    cps.add_doc("test text".split())
    cps.add_doc("test text".split())

    mdl = tp.HDPModel(corpus=cps)
    assert len(cps) == len(mdl.docs)
    assert cps[0].uid == mdl.docs[0].uid
    mdl.train(0)

    mdl = tp.HDPModel()
    ccps = mdl.add_corpus(cps)
    mdl.add_corpus(ccps)
def test_hdp_to_lda():
    mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_df=5, rm_top=5,
                      alpha=0.5, gamma=0.5, initial_k=5)
    for n, line in enumerate(open('test/sample.txt', encoding='utf-8')):
        ch = line.strip().split()
        mdl.add_doc(ch)
    mdl.burn_in = 100
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs,
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)

    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}\tNum. of tables: {}'.format(
            i, mdl.ll_per_word, mdl.live_k, mdl.num_tables))

    lda, topic_mapping = mdl.convert_to_lda(topic_threshold=1e-3)
    print(topic_mapping)

    for i in range(0, 100, 10):
        lda.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, lda.ll_per_word))

    for k in range(lda.k):
        print('Topic #{} ({})'.format(k, lda.get_count_by_topics()[k]))
        for word, prob in lda.get_topic_words(k):
            print('\t', word, prob, sep='\t')
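# A minimal sketch (not part of the snippet above) of using the LDA model obtained
# from convert_to_lda() to infer the topic distribution of an unseen document;
# the token list is a hypothetical placeholder.
unseen = lda.make_doc('new document text tokens'.split())
topic_dist, ll = lda.infer(unseen)
print('Topic distribution:', topic_dist, '\tLog-likelihood:', ll)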
def hdp_example(input_file, save_path):
    mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)
    for n, line in enumerate(open(input_file, encoding='utf-8')):
        ch = line.strip().split()
        mdl.add_doc(ch)
    mdl.burn_in = 100
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs,
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(
            i, mdl.ll_per_word, mdl.live_k))
    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(save_path, True)

    important_topics = [k for k, v in sorted(enumerate(mdl.get_count_by_topics()),
                                             key=lambda x: x[1], reverse=True)]
    for k in important_topics:
        if not mdl.is_live_topic(k):
            continue
        print('Topic #{}'.format(k))
        for word, prob in mdl.get_topic_words(k):
            print('\t', word, prob, sep='\t')
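# Hypothetical invocation of hdp_example(); both file paths are placeholders,
# not taken from the original snippet.
if __name__ == '__main__':
    hdp_example('test/sample.txt', 'hdp_model.bin')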
def train_hdp(corpus, initial_k=10, term_weight=tp.TermWeight.PMI,
              gamma=1, alpha=0.1, iterations=2000):
    """Train a hierarchical Dirichlet process (HDP) topic model."""
    hdp = tp.HDPModel(tw=term_weight, gamma=gamma, alpha=alpha,
                      initial_k=initial_k, seed=1000)

    # add samples to the model in a random (but reproducible) order
    random.seed(1000)
    random.shuffle(corpus)
    for c in corpus:
        hdp.add_doc(c)

    # discard the first N samples
    hdp.burn_in = 1000
    hdp.train(0)

    for i in range(0, iterations + 1, 100):
        hdp.train(100)
        print(f'{i = }\tlog-likelihood = {hdp.ll_per_word}\ttopics = {hdp.live_k}')

    return hdp
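# A minimal sketch of calling train_hdp(); the documents below are hypothetical
# stand-ins for a real tokenized corpus (a list of token lists).
docs = [
    'cats purr and chase mice'.split(),
    'dogs bark at the mail carrier'.split(),
    'stocks rallied after the earnings report'.split(),
]
hdp = train_hdp(docs, initial_k=2, iterations=200)
print('Live topics after training:', hdp.live_k)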
def tp_one_trial(dataset, model_type, topic_size, sample_size,
                 min_cf=3, rm_top=5, max_iter=1000, min_iter=None,
                 checkpoint=None, stop_increase=1, metric='ll'):
    assert model_type in ['lda', 'ctm', 'slda', 'hdp'], f'invalid `model_type`: {model_type}...'
    assert metric in ['ll', 'pp'], f'invalid `metric`: {metric}...'

    if model_type == 'lda':
        model = tp.LDAModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    if model_type == 'ctm':
        model = tp.CTModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    if model_type == 'slda':
        model = tp.SLDAModel(k=topic_size, vars='b', tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    if model_type == 'hdp':
        model = tp.HDPModel(initial_k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)

    sample_size = min(sample_size, len(dataset))
    # max_iter = max_iter * sample_size * topic_size // 2000  # scale iterations with sample size
    model.burn_in = max_iter // 5  # burn-in: 20 percent of max iterations

    for i in range(sample_size):
        doc, label = dataset[i]
        if model_type == 'slda':
            model.add_doc(doc, [float(label)])
        else:
            model.add_doc(doc)

    if min_iter is None:
        min_iter = max_iter // 5
    if checkpoint is None:
        checkpoint = max_iter // 5

    model.train(min_iter)

    pre_metric = -np.inf
    stop_increase_cnt = 0
    cur_metric = 0.
    for i in range(1, max_iter + 1):
        model.train(1)
        # A larger metric value is always better.
        if metric == 'll':
            cur_metric += model.ll_per_word
        if metric == 'pp':
            cur_metric += -model.perplexity  # smaller perplexity is better
        if i % checkpoint == 0:
            cur_metric /= checkpoint
            print(f'Current metric: {cur_metric:.5f}')
            if cur_metric >= pre_metric:
                pre_metric = cur_metric
            else:
                stop_increase_cnt += 1
            cur_metric = 0.
        if stop_increase_cnt >= stop_increase:
            break

    final_metric = model.perplexity if metric == 'pp' else model.ll_per_word
    print(f'Trial iterations: {i + min_iter}.')
    return model, final_metric
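# Hypothetical usage of tp_one_trial(); `dataset` is assumed to be a sequence of
# (token_list, label) pairs, which is what the function indexes into.
dataset = [('cats purr and chase mice'.split(), 0),
           ('stocks rallied after the earnings report'.split(), 1)]
model, score = tp_one_trial(dataset, 'hdp', topic_size=2, sample_size=2,
                            min_cf=0, rm_top=0, max_iter=100, metric='ll')
print('Final metric:', score)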
def hdp_model(self, text_data, save_path):
    mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)
    index = 0
    for doc in text_data:
        print(str(index) + " : " + str(doc))
        mdl.add_doc(doc)
        index += 1
    mdl.burn_in = 100
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, mdl.ll_per_word, mdl.live_k))
    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(save_path, True)

    topic_num = 0
    # extract candidates for automatic topic labeling
    extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
    cands = extractor.extract(mdl)
    # rank the label candidates for each topic
    labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
    important_topics = [k for k, v in sorted(enumerate(mdl.get_count_by_topics()),
                                             key=lambda x: x[1], reverse=True)]
    for k in important_topics:
        if not mdl.is_live_topic(k):
            continue
        print("== Topic #{} ==".format(k))
        print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
        for word, prob in mdl.get_topic_words(k, top_n=10):
            print(word, prob, sep='\t')
        print()
        topic_num += 1
    return (mdl, topic_num)
def hdp_param_checker(tw=tp.TermWeight.IDF, min_cf_0=0, min_cf_f=1, min_cf_s=1,
                      min_df_0=0, min_df_f=1, min_df_s=1,
                      rm_top_0=0, rm_top_f=1, rm_top_s=1,
                      k0_0=2, k0_f=12, k0_s=3,
                      alpha_0=-1, alpha_f=0, alpha_s=1,
                      eta_0=0, eta_f=1, eta_s=1,
                      gamma_0=0, gamma_f=1, gamma_s=1,
                      seed=101, corpus=None, burn=100, train=1001,
                      word_list=None, card_count=30,
                      to_excel=False, fname='param_checking.xlsx'):
    """
    Automatically iterate over different HDP parameter settings and compare the results.

    Parameters
    tw: Union[int, TermWeight]
        Term weighting scheme from
        https://bab2min.github.io/tomotopy/v0.8.0/en/#tomotopy.TermWeight ;
        I chose the default to be inverse document frequency, which means that cards
        that appear in almost all decks are weighted lower than cards that appear in
        very few decks.
    min_cf_0: int
        Starting minimum card collection frequency
    min_cf_f: int
        Ending minimum card collection frequency
    min_cf_s: int
        Minimum card collection frequency step size
    min_df_0: int
        Starting minimum deck collection frequency
    min_df_f: int
        Ending minimum deck collection frequency
    min_df_s: int
        Minimum deck collection frequency step size
    rm_top_0: int
        Starting number of top cards to exclude
    rm_top_f: int
        Ending number of top cards to exclude
    rm_top_s: int
        Top cards to exclude step size
    k0_0: int
        Starting number of initial topics
    k0_f: int
        Ending number of initial topics
    k0_s: int
        Number of initial topics step size
    alpha_0: int
        Starting power of ten for the alpha hyperparameter, i.e. alpha = 10^(alpha_0)
    alpha_f: int
        Ending power of ten for the alpha hyperparameter, i.e. alpha = 10^(alpha_f)
    alpha_s: int
        Step size for the powers of ten of the alpha hyperparameter
    eta_0: int
        Starting power of ten for the eta hyperparameter, i.e. eta = 10^(eta_0)
    eta_f: int
        Ending power of ten for the eta hyperparameter, i.e. eta = 10^(eta_f)
    eta_s: int
        Step size for the powers of ten of the eta hyperparameter
    gamma_0: int
        Starting power of ten for the gamma hyperparameter, i.e. gamma = 10^(gamma_0)
    gamma_f: int
        Ending power of ten for the gamma hyperparameter, i.e. gamma = 10^(gamma_f)
    gamma_s: int
        Step size for the powers of ten of the gamma hyperparameter
    seed: int
        Random seed. Set to 101 by default in an attempt to reproduce results;
        however, exact reproduction has proven to be... elusive.
    corpus: tomotopy Corpus
        A collection of documents to be added to the model. The method will not
        function without a corpus.
    burn: int
        Number of initial (burn-in) training iterations whose results are discarded.
    train: int
        Number of iterations to train over
    word_list: list of lists of strings
        Collection of decklists, with each card name represented as a string.
    card_count: int
        Number of cards used to evaluate coherence.
    to_excel: boolean
        Whether to write the resulting DataFrame to an Excel spreadsheet.
    fname: string ending in '.xlsx'
        If to_excel == True, filename of the resulting Excel spreadsheet.

    :return: DataFrame listing the results of each run. Contains the following columns:
        k - total number of topics allocated by the model (including dead topics)
        Live k - number of topics that are actually live (in use)
        Avg. LL - average log-likelihood per word (higher, i.e. closer to zero, is better)
        LL Std. Dev. - log-likelihood standard deviation
        LL CV - log-likelihood coefficient of variation (Std. Dev. / Average)
        Perplexity - perplexity of the model (lower is better)
        Coherence - (C_V) coherence of the model (higher is better)
    """
    results_lists = [[
        'tw', 'Min. f_col', 'Min. f_doc', 'Top n Terms Removed', 'Initial k',
        'alpha', 'eta', 'gamma', 'k', 'Live k', 'Avg. LL', 'LL Std. Dev.',
        'LL CV', 'Perplexity'
    ]]
    average_coherences = []
    coh_std_dev = []
    coh_cv = []
    max_live_top = 0
    for cf in range(min_cf_0, min_cf_f, min_cf_s):
        print("Collection Frequency = " + str(cf))
        for df in range(min_df_0, min_df_f, min_df_s):
            print("Document Frequency = " + str(df))
            for rm in range(rm_top_0, rm_top_f, rm_top_s):
                print("Remove Top " + str(rm) + " Words")
                for k in range(k0_0, k0_f, k0_s):
                    print(str(k) + " Initial Topics")
                    for a in range(alpha_0, alpha_f, alpha_s):
                        print("alpha = " + str(10**a))
                        for e in range(eta_0, eta_f, eta_s):
                            print("eta = " + str(10**e))
                            for g in range(gamma_0, gamma_f, gamma_s):
                                print("gamma = " + str(10**g))
                                ll_list = []
                                hdp = tp.HDPModel(tw=tw, min_cf=cf, min_df=df,
                                                  rm_top=rm, initial_k=k,
                                                  alpha=10**a, eta=10**e,
                                                  gamma=10**g, seed=seed,
                                                  corpus=corpus)
                                hdp.burn_in = burn
                                hdp.train(0)
                                for i in range(0, train, 100):
                                    hdp.train(100)
                                    ll_list.append(hdp.ll_per_word)
                                hdp_mean = sum(ll_list) / len(ll_list)
                                hdp_variance = sum(
                                    ((x - hdp_mean)**2) for x in ll_list
                                ) / len(ll_list)
                                hdp_std_dev = hdp_variance**0.5
                                hdp_cv = hdp_std_dev / hdp_mean
                                hdp_topics = get_hdp_topics(hdp, card_count)
                                # hdp_coh = eval_coherence(hdp_topics, word_list=word_list)
                                results_list = [
                                    str(tw), cf, df, rm, k, 10**a, 10**e, 10**g,
                                    hdp.k, hdp.live_k, hdp_mean, hdp_std_dev,
                                    hdp_cv, hdp.perplexity
                                ]
                                topic_coherences = eval_coherence_by_topic(
                                    hdp, deck_lists=word_list)
                                results_list.extend(topic_coherences)
                                average_coh = eval_coherence(hdp_topics, word_list)
                                average_coherences.append(average_coh)
                                coh_variance = sum(
                                    ((x - average_coh)**2) for x in topic_coherences
                                ) / len(topic_coherences)
                                coh_std_dev.append(coh_variance**0.5)
                                coh_cv.append((coh_variance**0.5) / average_coh)
                                results_lists.append(results_list)
                                if hdp.live_k > max_live_top:
                                    max_live_top = hdp.live_k
    for num_top in range(0, max_live_top):
        results_lists[0].append('Top ' + str(num_top) + ' Coherence')
    results_df = pd.DataFrame(data=results_lists[1:], columns=results_lists[0])
    results_df['Average Coherence'] = average_coherences
    results_df['Coherence Std Dev'] = coh_std_dev
    results_df['Coherence CV'] = coh_cv
    if to_excel:
        results_df.to_excel(fname, encoding='utf-8')
    return results_df
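# Hypothetical call to hdp_param_checker(); `deck_corpus` and `deck_lists` are
# placeholders for a tomotopy Corpus and the matching list of decklists used for
# coherence scoring, neither of which comes from the snippet above.
results = hdp_param_checker(corpus=deck_corpus, word_list=deck_lists,
                            k0_0=2, k0_f=8, k0_s=3, train=501,
                            to_excel=True, fname='hdp_grid.xlsx')
print(results[['Initial k', 'Live k', 'Average Coherence']])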
def create_hdp(tw=tp.TermWeight.IDF, min_cf=0, min_df=5, rm_top=0, initial_k=2,
               alpha=0.1, eta=1, gamma=1, seed=101, corpus=None):
    """
    Creates a tomotopy HDPModel().

    Parameters:
    tw: Union[int, TermWeight]
        Term weighting scheme from
        https://bab2min.github.io/tomotopy/v0.8.0/en/#tomotopy.TermWeight ;
        I chose the default to be inverse document frequency, which means that cards
        that appear in almost all decks are weighted lower than cards that appear in
        very few decks.
    min_cf: int
        Minimum total number of times a card must appear across all decks
        (collection frequency) to be included. Since the vast majority of cards can
        be included at most once per deck, this is almost always the same as min_df.
    min_df: int
        Minimum number of decks a card must appear in to be included in the
        analysis; default is set to 5.
    rm_top: int
        Removes the top n most popular cards in a given commander's decks from the
        analysis. Default is 0.
    initial_k: int
        Number of themes/archetypes that you THINK this commander's decks can be
        sorted into. This number does not dictate, per se, how many themes will be
        identified once the analysis is over. A good place to start may be the
        number of themes already identified for this commander on EDHREC.
        The default value is 2.
    alpha: float
        "concentration coefficient of Dirichlet Process for document-table".
        Increasing alpha ...
        Based on advice from Eduardo Coronado (@ecoronado92 on Twitter), the
        default for alpha is set to 0.1.
    eta: float
        "hyperparameter of Dirichlet distribution for topic-word".
        Increasing eta ...
        Based on experimentation, the default for eta is 1.
    gamma: float
        "concentration coefficient of Dirichlet process for table-topic".
        Governs the overall number of themes/archetypes that the decks can share;
        increasing gamma increases the number of themes that can be identified.
        Based on advice from Eduardo Coronado (@ecoronado92 on Twitter), the
        default for gamma is set to 1.
    seed: int
        Random seed. Set to 101 by default in an attempt to reproduce results;
        however, exact reproduction has proven to be... elusive.
    corpus: tomotopy Corpus
        A collection of documents to be added to the model. If None, documents have
        to be added via HDPModel.add_doc() after the model is created and before it
        can be trained.

    :return: tomotopy HDP model object
    """
    hdp = tp.HDPModel(tw=tw, min_cf=min_cf, min_df=min_df, rm_top=rm_top,
                      initial_k=initial_k, alpha=alpha, eta=eta, gamma=gamma,
                      seed=seed, corpus=corpus)
    return hdp
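# A short sketch of using create_hdp() with a prebuilt corpus; `deck_corpus` is a
# placeholder for a tomotopy Corpus of decklists, and the iteration counts are
# illustrative only.
hdp = create_hdp(initial_k=4, corpus=deck_corpus)
hdp.burn_in = 100
hdp.train(500)
print('Topics found:', hdp.live_k, '\tPerplexity:', hdp.perplexity)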
def main():
    """ """
    ## Establish Model Directory
    if not os.path.exists(MODEL_DIR):
        _ = os.makedirs(MODEL_DIR)
    ## Filenames
    submission_filenames = sorted(glob(f"{DATA_DIR}raw/AskDocs/submissions/*.json.gz"))
    ## Try To Load Phrasers, Or Learn Them as Fallback
    phrasers, ngrams = learn_phrasers(submission_filenames,
                                      verbose=False,
                                      model_dir=MODEL_DIR)
    ## Get Vectorized Representation
    X, post_ids, vocabulary = vectorize_data(filenames=submission_filenames,
                                             phrasers=phrasers,
                                             ngrams=ngrams,
                                             verbose=False,
                                             model_dir=MODEL_DIR)
    id2word = dict(zip(range(X.shape[1]), vocabulary))
    ## Drop Examples Without Vocabulary
    sample_mask = np.nonzero(X.getnnz(axis=1) > 0)[0]
    X = X[sample_mask]
    post_ids = [post_ids[sm] for sm in sample_mask]
    ## Transform Matrix to Vocabulary
    corpus = generate_corpus(X, vocabulary)
    ## Initialize Model
    model = tp.HDPModel(corpus=corpus,
                        initial_k=INITIAL_K,
                        alpha=ALPHA_PRIOR,
                        eta=ETA_PRIOR,
                        gamma=GAMMA_PRIOR,
                        seed=RANDOM_SEED)
    ## Fit Model using Gibbs Sampler
    print("Beginning Model Training")
    params = np.zeros((MODEL_N_ITER, 5))
    for iteration in tqdm(range(MODEL_N_ITER), total=MODEL_N_ITER, desc="MCMC", file=sys.stdout):
        model.train(1, workers=NUM_JOBS)
        params[iteration] = np.array([model.k, model.live_k, model.alpha, model.gamma, model.ll_per_word])
    live_topics = [i for i in range(model.k) if model.is_live_topic(i)]
    params = pd.DataFrame(params, columns=["k", "live_k", "alpha", "gamma", "ll"])
    params.to_csv(f"{MODEL_DIR}/hdp.mcmc.csv", index=False)
    ## Trace Plots
    fig, ax = plt.subplots(2, 2, figsize=(10, 5.6), sharex=True)
    ax[0, 0].plot(params["k"], label="K", color="C0", alpha=0.8)
    ax[0, 0].plot(params["live_k"], label="Active K", color="C1", alpha=0.8)
    ax[0, 1].plot(params["alpha"], color="C2", alpha=0.8)
    ax[1, 0].plot(params["gamma"], color="C3", alpha=0.8)
    ax[1, 1].plot(params["ll"], color="black", alpha=0.8)
    ax[0, 0].legend(loc="lower right", fontsize=13)
    for i in range(2):
        ax[1, i].set_xlabel("MCMC Iteration")
        for j in range(2):
            ax[i, j].spines["right"].set_visible(False)
            ax[i, j].spines["top"].set_visible(False)
            ax[i, j].tick_params(labelsize=10)
    ax[0, 0].set_title("# Components")
    ax[0, 1].set_title("$\\alpha$")
    ax[1, 0].set_title("$\\gamma$")
    ax[1, 1].set_title("Log-Likelihood")
    fig.tight_layout()
    fig.savefig(f"{MODEL_DIR}trace.png", dpi=300)
    plt.close(fig)
    ## Save the Model
    _ = model.save(f"{MODEL_DIR}/hdp.model")
    _ = model.summary(file=open(f"{MODEL_DIR}/hdp.summary.txt", "w"), topic_word_top_n=20)
    with open(f"{MODEL_DIR}/hdp.summary.txt", "a") as the_file:
        the_file.write("Live Topics: {}".format(", ".join(list(map(str, live_topics)))))
    ## Extract Topic Distribution
    print("Caching Topic Distribution")
    topic_word_dist = np.vstack([model.get_topic_word_dist(topic) for topic in live_topics])
    topic_terms = []
    for i, dist in enumerate(topic_word_dist):
        dist_sorting = np.argsort(dist)[::-1][:CACHE_TOP_K]
        dist_sorting_vocab = [[model.used_vocabs[d], float(dist[d])] for d in dist_sorting]
        topic_terms.append({"topic": i, "terms": dist_sorting_vocab})
    with open(f"{MODEL_DIR}topic_terms.json", "w") as the_file:
        for tt in topic_terms:
            _ = the_file.write(f"{json.dumps(tt)}\n")
    ## Topic Assignments
    print("Caching Topic Assignments")
    doc_topic_dist = np.vstack([doc.get_topic_dist() for doc in model.docs])[:, live_topics]
    topic_assignments = []
    for post_id, doc in zip(post_ids, doc_topic_dist):
        doc_topics = [[int(d), float(doc[d])] for d in doc.nonzero()[0]]
        topic_assignments.append({"id": post_id, "topics": doc_topics})
    with open(f"{MODEL_DIR}topic_assignments.json", "w") as the_file:
        for ta in topic_assignments:
            _ = the_file.write(f"{json.dumps(ta)}\n")
    print("Script Complete.")
import tomotopy as tp

model = tp.HDPModel()
print(model.alpha)
print(model.eta)
print(model.gamma)
print(model.live_k)
print(model.num_tables)
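# The attributes above reflect an empty, untrained model; a minimal (hypothetical)
# continuation to see them change after adding documents and sampling.
model.add_doc('a b c a b'.split())
model.add_doc('c d e d e'.split())
model.train(100)
print(model.live_k, model.num_tables, model.ll_per_word)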
import sys
import pickle

import tomotopy as tp
from gensim import models, corpora
from gensim.models import CoherenceModel

print(tp.isa)  # prints 'avx2', 'avx', 'sse2' or 'none'

with open('outfile', 'rb') as fp:
    text_list = pickle.load(fp)

hdp = tp.HDPModel(seed=1000, tw=tp.TermWeight.IDF, initial_k=10)

for i in text_list:
    hdp.add_doc(words=i)

# Initiate sampling burn-in (i.e. discard the N first iterations)
hdp.burn_in = 10000
hdp.train(0)
print('Num docs:', len(hdp.docs), ', Vocab size:', hdp.num_vocabs, ', Num words:', hdp.num_words)
print('Removed top words:', hdp.removed_top_words)

# Train model
for i in range(0, 1000, 100):
    hdp.train(100)  # 100 iterations at a time
    print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, hdp.ll_per_word, hdp.live_k))

hdp.save("unigram_hdp_model.bin")
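# A minimal sketch of reloading the saved model for later use; the filename matches
# the save call above, the rest is illustrative.
hdp = tp.HDPModel.load("unigram_hdp_model.bin")
for k in range(hdp.k):
    if not hdp.is_live_topic(k):
        continue
    print('Topic #{}:'.format(k), hdp.get_topic_words(k, top_n=10))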
!pip install tomotopy

import tomotopy as tp

# Train HDP model.
mdl = tp.HDPModel(min_cf=0, min_df=0, rm_top=0, initial_k=30, alpha=1, eta=0.01, gamma=1)
for line in open('bigrammed.txt'):
    mdl.add_doc(line.strip().split())

for i in range(0, 1000, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

for k in range(mdl.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=10))

mdl.summary()
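# Note that mdl.k counts dead topics as well; a small follow-up (not in the original
# snippet) that reports only topics still alive after sampling.
live = [k for k in range(mdl.k) if mdl.is_live_topic(k)]
print('Live topics: {} of {}'.format(len(live), mdl.k))
for k in live:
    print('Topic #{}:'.format(k), mdl.get_topic_words(k, top_n=10))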