Example 1
import tomotopy as tp

def test_empty_uid():
    cps = tp.utils.Corpus()
    cps.add_doc("test text".split())
    cps.add_doc("test text".split())
    cps.add_doc("test text".split())

    mdl = tp.HDPModel(corpus=cps)
    assert len(cps) == len(mdl.docs)
    assert cps[0].uid == mdl.docs[0].uid
    mdl.train(0)

    mdl = tp.HDPModel()
    ccps = mdl.add_corpus(cps)
    mdl.add_corpus(ccps)
Example 2
import tomotopy as tp

def test_hdp_to_lda():
    mdl = tp.HDPModel(tw=tp.TermWeight.ONE,
                      min_df=5,
                      rm_top=5,
                      alpha=0.5,
                      gamma=0.5,
                      initial_k=5)
    for line in open('test/sample.txt', encoding='utf-8'):
        ch = line.strip().split()
        mdl.add_doc(ch)
    mdl.burn_in = 100
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs,
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print(
            'Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}\tNum. of tables: {}'
            .format(i, mdl.ll_per_word, mdl.live_k, mdl.num_tables))

    lda, topic_mapping = mdl.convert_to_lda(topic_threshold=1e-3)
    print(topic_mapping)
    for i in range(0, 100, 10):
        lda.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, lda.ll_per_word))

    for k in range(lda.k):
        print('Topic #{} ({})'.format(k, lda.get_count_by_topics()[k]))
        for word, prob in lda.get_topic_words(k):
            print('\t', word, prob, sep='\t')
Example 3
import sys

import tomotopy as tp

def hdp_example(input_file, save_path):
    mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)
    for line in open(input_file, encoding='utf-8'):
        ch = line.strip().split()
        mdl.add_doc(ch)
    mdl.burn_in = 100
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs,
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(
            i, mdl.ll_per_word, mdl.live_k))

    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(save_path, True)

    important_topics = [
        k for k, v in sorted(enumerate(mdl.get_count_by_topics()),
                             key=lambda x: x[1],
                             reverse=True)
    ]
    for k in important_topics:
        if not mdl.is_live_topic(k): continue
        print('Topic #{}'.format(k))
        for word, prob in mdl.get_topic_words(k):
            print('\t', word, prob, sep='\t')
Example 4
import random

import tomotopy as tp

def train_hdp(corpus,
              initial_k=10,
              term_weight=tp.TermWeight.PMI,
              gamma=1,
              alpha=0.1,
              iterations=2000):
    """
    Train a heirarchical dirichlet process topic model
    """

    hdp = tp.HDPModel(tw=term_weight,
                      gamma=gamma,
                      alpha=alpha,
                      initial_k=initial_k,
                      seed=1000)

    # add samples to model
    random.seed(1000)
    random.shuffle(corpus)
    for c in corpus:
        hdp.add_doc(c)

    # burn-in: discard the samples from the first N iterations
    hdp.burn_in = 1000
    hdp.train(0)

    for i in range(0, iterations + 1, 100):
        hdp.train(100)
        print(
            f'{i = }\tlog-likelihood = {hdp.ll_per_word}\ttopics = {hdp.live_k}'
        )

    return hdp
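A minimal invocation sketch for train_hdp, assuming corpus is a list of token lists (the file path below is illustrative). Note that the function shuffles the list in place before adding documents:

docs = [line.strip().split() for line in open('sample.txt', encoding='utf-8')]
hdp = train_hdp(docs, initial_k=10, iterations=1000)
print('live topics after training:', hdp.live_k)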
Example 5
import numpy as np
import tomotopy as tp

def tp_one_trial(dataset, model_type, topic_size, sample_size, min_cf=3, rm_top=5,
                 max_iter=1000, min_iter=None, checkpoint=None, stop_increase=1, metric='ll'):
    assert model_type in ['lda', 'ctm', 'slda', 'hdp'], f'invalid `model_type`: {model_type}...'
    assert metric in ['ll', 'pp'], f'invalid `metric`: {metric}...'
    if model_type == 'lda':
        model = tp.LDAModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'ctm':
        model = tp.CTModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'slda':
        model = tp.SLDAModel(k=topic_size, vars='b', tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'hdp':
        model = tp.HDPModel(initial_k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    sample_size = min(sample_size, len(dataset))
    
#     max_iter = max_iter * sample_size * topic_size // 2000  # ensure the number of iterations increases with the size of sample
    model.burn_in = max_iter // 5  # set burn-in: 20 percent of max iterations

    for i in range(sample_size):
        doc, label = dataset[i]
        if model_type == "slda":
            model.add_doc(doc, [float(label)])
        else:
            model.add_doc(doc)

    if min_iter is None:
        min_iter = max_iter // 5
    if checkpoint is None:
        checkpoint = max_iter // 5

    model.train(min_iter)

    pre_metric = -np.inf
    stop_increase_cnt = 0
    cur_metric = 0.
    for i in range(1, max_iter+1):
        model.train(1)
        # the metric is oriented so that larger is always better
        if metric == 'll':
            cur_metric += model.ll_per_word
        elif metric == 'pp':
            cur_metric -= model.perplexity  # smaller perplexity is better

        if i % checkpoint == 0:
            cur_metric /= checkpoint
            print(f'Current metric: {cur_metric:.5f}')
            if cur_metric >= pre_metric:
                pre_metric = cur_metric
            else:
                stop_increase_cnt += 1
            cur_metric = 0.

        if stop_increase_cnt >= stop_increase:
            break

    final_metric = model.perplexity if metric == 'pp' else model.ll_per_word

    print(f'Trial iterations: {i + min_iter}.')
    return model, final_metric
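A minimal usage sketch, assuming dataset is a sequence of (token_list, label) pairs; the two repeated documents below are purely illustrative:

dataset = [('mana ramp forest stompy'.split(), 1),
           ('counterspell island draw'.split(), 0)] * 50
model, score = tp_one_trial(dataset, 'hdp', topic_size=5, sample_size=100,
                            min_cf=1, rm_top=0, max_iter=200, metric='ll')
print(f'final log-likelihood per word: {score:.4f}')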
Example 6
    # requires at module level: import sys; import tomotopy as tp
    def hdp_model(self, text_data, save_path):
        mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)
        for index, doc in enumerate(text_data):
            print(f'{index} : {doc}')
            mdl.add_doc(doc)

        mdl.burn_in = 100
        mdl.train(0)
        print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
        print('Removed top words:', mdl.removed_top_words)
        print('Training...', file=sys.stderr, flush=True)
        for i in range(0, 1000, 10):
            mdl.train(10)
            print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, mdl.ll_per_word, mdl.live_k))

        print('Saving...', file=sys.stderr, flush=True)
        mdl.save(save_path, True)

        topic_num = 0
        # extract candidates for auto topic labeling
        extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
        cands = extractor.extract(mdl)

        # ranking the candidates of labels for a specific topic
        labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
        important_topics = [k for k, v in
                            sorted(enumerate(mdl.get_count_by_topics()), key=lambda x: x[1], reverse=True)]
        for k in important_topics:
            if not mdl.is_live_topic(k): continue
            print("== Topic #{} ==".format(k))
            print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
            for word, prob in mdl.get_topic_words(k, top_n=10):
                print(word, prob, sep='\t')
            print()
            topic_num += 1
        return (mdl, topic_num)
Example 7
import pandas as pd
import tomotopy as tp

# Relies on helpers defined elsewhere in the project:
# get_hdp_topics, eval_coherence, eval_coherence_by_topic.
def hdp_param_checker(tw=tp.TermWeight.IDF,
                      min_cf_0=0,
                      min_cf_f=1,
                      min_cf_s=1,
                      min_df_0=0,
                      min_df_f=1,
                      min_df_s=1,
                      rm_top_0=0,
                      rm_top_f=1,
                      rm_top_s=1,
                      k0_0=2,
                      k0_f=12,
                      k0_s=3,
                      alpha_0=-1,
                      alpha_f=0,
                      alpha_s=1,
                      eta_0=0,
                      eta_f=1,
                      eta_s=1,
                      gamma_0=0,
                      gamma_f=1,
                      gamma_s=1,
                      seed=101,
                      corpus=None,
                      burn=100,
                      train=1001,
                      word_list=None,
                      card_count=30,
                      to_excel=False,
                      fname='param_checking.xlsx'):
    """
    Method to automatically iterate through different HDP parameters to compare results
    Parameters
        tw: Union[int, TermWeight]
            term weighting scheme in https://bab2min.github.io/tomotopy/v0.8.0/en/#tomotopy.TermWeight ;
            I chose the default to be inverse document frequency, which means that cards that appear in
            almost all decks are weighted lower than cards that appear in very few decks.
        min_cf_0: int
            Starting minimum card collection frequency
        min_cf_f: int
            Ending minimum card collection frequency
        min_cf_s: int
            Minimum card collection frequency step size
        min_df_0: int
            Starting minimum deck collection frequency
        min_df_f: int
            Ending minimum deck collection frequency
        min_df_s: int
            Minimum deck collection frequency step size
        rm_top_0: int
            Starting number of top cards to exclude
        rm_top_f: int
            Ending number of top cards to exclude
        rm_top_s: int
            Top cards to exclude step size
        k0_0: int
            Starting number of initial topics
        k0_f: int
            Ending number of initial topics
        k0_s: int
            Number of initial topics step size
        alpha_0: int
            Starting number for the alpha hyperparameter as a power of ten, i.e. alpha = 10^(alpha_0)
        alpha_f: int
            Ending number for the alpha hyperparameter as a power of ten, i.e. alpha = 10^(alpha_f)
        alpha_s: int
            Step size for the powers of ten of the alpha hyperparameter
        eta_0: int
            Starting number for the eta hyperparameter as a power of ten, i.e. eta = 10^(eta_0)
        eta_f: int
            Ending number for the eta hyperparameter as a power of ten, i.e. eta = 10^(eta_f)
        eta_s: int
            Step size for the powers of ten of the eta hyperparameter
        gamma_0: int
            Starting number for the gamma hyperparameter as a power of ten, i.e. gamma = 10^(gamma_0)
        gamma_f: int
            Ending number for the gamma hyperparameter as a power of ten, i.e. gamma = 10^(gamma_f)
        gamma_s: int
            Step size for the powers of ten of the gamma hyperparameter
        seed: int
            Random seed. Set to 101 as default in an attempt to duplicate results; however, said
            duplication has proven to be... elusive.
        corpus: tomotopy Corpus
            A list of documents to be added into the model. The method will not run without one.
        burn: int
            Number of initial (burn-in) training iterations whose samples are discarded.
        train: int
            Number of iterations to train over
        word_list: list of lists of strings
            Collection of decklists with each card name represented as a string.
        card_count: int
            Number of cards used to evaluate card coherence.
        to_excel: boolean
            If True, write the resulting DataFrame to an Excel spreadsheet.
        fname: string ending in '.xlsx'
            If to_excel == True, filename of the resulting Excel spreadsheet.
    :return:
        DataFrame that lists the results of the preceding iterations. Contains the following columns:
            k - total number of topics allocated by the model (not all of which are live)
            Live k - number of topics that are actually viable
            Avg. LL - average log-likelihood per word (higher, i.e. closer to zero, is better)
            LL Std. Dev. - log-likelihood standard deviation
            LL CV - log-likelihood coefficient of variation (Std. Dev./Average)
            Perplexity - perplexity of the model (lower is better)
            Coherence - (C_V) coherence of the model; higher values indicate more
                semantically coherent topics
    """

    results_lists = [[
        'tw', 'Min. f_col', 'Min. f_doc', 'Top n Terms Removed', 'Initial k',
        'alpha', 'eta', 'gamma', 'k', 'Live k', 'Avg. LL', 'LL Std. Dev.',
        'LL CV', 'Perplexity'
    ]]
    average_coherences = []
    coh_std_dev = []
    coh_cv = []
    max_live_top = 0
    for cf in range(min_cf_0, min_cf_f, min_cf_s):
        print("Collection Frequency = " + str(cf))
        for df in range(min_df_0, min_df_f, min_df_s):
            print("Document Frequency = " + str(df))
            for rm in range(rm_top_0, rm_top_f, rm_top_s):
                print("Remove Top " + str(rm) + " Words")
                for k in range(k0_0, k0_f, k0_s):
                    print(str(k) + " Initial Topics")
                    for a in range(alpha_0, alpha_f, alpha_s):
                        print("alpha = " + str(10**a))
                        for e in range(eta_0, eta_f, eta_s):
                            print("eta = " + str(10**e))
                            for g in range(gamma_0, gamma_f, gamma_s):
                                print("gamma = " + str(10**g))
                                ll_list = []
                                hdp = tp.HDPModel(tw=tw,
                                                  min_cf=cf,
                                                  min_df=df,
                                                  rm_top=rm,
                                                  initial_k=k,
                                                  alpha=10**a,
                                                  eta=10**e,
                                                  gamma=10**g,
                                                  seed=seed,
                                                  corpus=corpus)
                                hdp.burn_in = burn
                                hdp.train(0)
                                for i in range(0, train, 100):
                                    hdp.train(100)
                                    ll_list.append(hdp.ll_per_word)
                                hdp_mean = sum(ll_list) / len(ll_list)
                                hdp_variance = sum([
                                    ((x - hdp_mean)**2) for x in ll_list
                                ]) / len(ll_list)
                                hdp_std_dev = hdp_variance**0.5
                                hdp_cv = hdp_std_dev / hdp_mean
                                hdp_topics = get_hdp_topics(hdp, card_count)
                                # hdp_coh = eval_coherence(hdp_topics, word_list=word_list)
                                results_list = [
                                    str(tw), cf, df, rm, k, 10**a, 10**e,
                                    10**g, hdp.k, hdp.live_k, hdp_mean,
                                    hdp_std_dev, hdp_cv, hdp.perplexity
                                ]
                                topic_coherences = eval_coherence_by_topic(
                                    hdp, deck_lists=word_list)
                                results_list.extend(topic_coherences)
                                average_coh = eval_coherence(
                                    hdp_topics, word_list)
                                average_coherences.append(average_coh)
                                coh_variance = sum([((x - average_coh)**2)
                                                    for x in topic_coherences
                                                    ]) / len(topic_coherences)
                                coh_std_dev.append(coh_variance**0.5)
                                coh_cv.append((coh_variance**0.5) / average_coh)
                                results_lists.append(results_list)
                                if hdp.live_k > max_live_top:
                                    max_live_top = hdp.live_k
    for num_top in range(0, max_live_top):
        results_lists[0].append('Top ' + str(num_top) + ' Coherence')
    # pad rows so every row matches the header length (rows carry one coherence
    # entry per live topic, which varies between runs)
    width = len(results_lists[0])
    for row in results_lists[1:]:
        row.extend([None] * (width - len(row)))
    df = pd.DataFrame(data=results_lists[1:], columns=results_lists[0])
    df['Average Coherence'] = average_coherences
    df['Coherence Std Dev'] = coh_std_dev
    df['Coherence CV'] = coh_cv
    if to_excel:
        df.to_excel(fname)
    return df
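A minimal invocation sketch with illustrative deck lists; it assumes the coherence helpers named above are importable. The sweep ranges are half-open, so the defaults run a single value per parameter:

decks = [['sol ring', 'counterspell', 'swan song'],
         ['sol ring', 'cultivate', 'rampant growth']] * 20
corpus = tp.utils.Corpus()
for deck in decks:
    corpus.add_doc(deck)
results = hdp_param_checker(k0_0=2, k0_f=8, k0_s=3, train=301,
                            corpus=corpus, word_list=decks)
print(results.head())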
Example 8
import json
import os
import sys
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tomotopy as tp
from tqdm import tqdm

def create_hdp(tw=tp.TermWeight.IDF,
               min_cf=0,
               min_df=5,
               rm_top=0,
               initial_k=2,
               alpha=0.1,
               eta=1,
               gamma=1,
               seed=101,
               corpus=None):
    """
    Creates a tomotopy HDPModel()
    Parameters:
        tw: Union[int, TermWeight]
            term weighting scheme in https://bab2min.github.io/tomotopy/v0.8.0/en/#tomotopy.TermWeight ;
            I chose the default to be inverse document frequency, which means that cards that appear in
            almost all decks are weighted lower than cards that appear in very few decks.
        min_cf: int
            Minimum collection frequency: the minimum number of times that a card must appear
            across all decks to be included. Since the vast majority of cards can be included
            at most once per deck, this is almost always going to be the same as min_df.
        min_df: int
            Minimum document frequency: the minimum number of decks that a card must appear in
            to be included in the analysis; default is set to 5.
        rm_top: int
            Removes the top n most frequent cards from the vocabulary entirely. Default is 0.
        initial_k: int
            Number of themes/archetypes that you THINK this commander's decks can be sorted into.
            This number does not dictate, per se, how many themes will be identified once the analysis
            is over. Perhaps a good place to start would be with how many themes have been currently
            identified for this commander already on EDHREC? The default value is 2.
        alpha: float
            "concentration coe[f]ficient of Dirichlet Process for document-table". Increasing alpha ... Based
            on advice from Eduardo Coronado (@ecoronado92 on Twitter), default for alpha is set to 0.1.
        eta: float
            "hyperparameter of Dirichlet distribution for topic-word". Increasing eta ... Based on
            experimentation, default for eta is 1.
        gamma: float
            "concentration coef[f]icient of Dirichlet [p]rocess for table-topic". Sets the overall number
            of themes/archetypes that the decks can share. Increasing gamma increases the number of
            themes that can be identified. Based on advice from Eduardo Coronado (@ecoronado92 on
            Twitter), default for gamma is set to 1.
        seed: int
            Random seed. Set to 101 as default in an attempt to duplicate results; however, said
            duplication has proven to be... elusive.
        corpus: tomotopy Corpus
            A list of documents to be added into the model. If None, documents have to be added
            after the model is created through HDPModel.add_doc() before the model can be trained.

    :return:
        tomotopy HDP model object
    """

    hdp = tp.HDPModel(tw=tw,
                      min_cf=min_cf,
                      min_df=min_df,
                      rm_top=rm_top,
                      initial_k=initial_k,
                      alpha=alpha,
                      eta=eta,
                      gamma=gamma,
                      seed=seed,
                      corpus=corpus)
    return hdp
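# Usage sketch (illustrative data, not part of the original module). With the
# default min_df=5, cards appearing in fewer than 5 decks are dropped, so a toy
# input needs min_df lowered:
#
#     decks = [['sol ring', 'counterspell'], ['sol ring', 'cultivate']] * 10
#     hdp = create_hdp(initial_k=2, min_df=1)
#     for deck in decks:
#         hdp.add_doc(deck)
#     hdp.train(100)
#     print('live topics:', hdp.live_k)
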
def main():
    """
    Train an HDP topic model on AskDocs submissions, then cache the MCMC trace,
    topic-term distributions, and per-document topic assignments.

    Assumes module-level constants (MODEL_DIR, DATA_DIR, INITIAL_K, ALPHA_PRIOR,
    ETA_PRIOR, GAMMA_PRIOR, RANDOM_SEED, MODEL_N_ITER, NUM_JOBS, CACHE_TOP_K) and
    helpers (learn_phrasers, vectorize_data, generate_corpus) defined elsewhere.
    """
    ## Establish Model Directory
    if not os.path.exists(MODEL_DIR):
        _ = os.makedirs(MODEL_DIR)
    ## Filenames
    submission_filenames = sorted(
        glob(f"{DATA_DIR}raw/AskDocs/submissions/*.json.gz"))
    ## Try To Load Phrasers, Or Learn Them as Fallback
    phrasers, ngrams = learn_phrasers(submission_filenames,
                                      verbose=False,
                                      model_dir=MODEL_DIR)
    ## Get Vectorized Representation
    X, post_ids, vocabulary = vectorize_data(filenames=submission_filenames,
                                             phrasers=phrasers,
                                             ngrams=ngrams,
                                             verbose=False,
                                             model_dir=MODEL_DIR)
    id2word = dict(zip(range(X.shape[1]), vocabulary))
    ## Drop Examples Without Vocabulary
    sample_mask = np.nonzero(X.getnnz(axis=1) > 0)[0]
    X = X[sample_mask]
    post_ids = [post_ids[sm] for sm in sample_mask]
    ## Transform Matrix to Vocabulary
    corpus = generate_corpus(X, vocabulary)
    ## Initialize Model
    model = tp.HDPModel(corpus=corpus,
                        initial_k=INITIAL_K,
                        alpha=ALPHA_PRIOR,
                        eta=ETA_PRIOR,
                        gamma=GAMMA_PRIOR,
                        seed=RANDOM_SEED)
    ## Fit Model using Gibbs Sampler
    print("Beginning Model Training")
    params = np.zeros((MODEL_N_ITER, 5))
    for iteration in tqdm(range(MODEL_N_ITER),
                          total=MODEL_N_ITER,
                          desc="MCMC",
                          file=sys.stdout):
        model.train(1, workers=NUM_JOBS)
        params[iteration] = np.array([
            model.k, model.live_k, model.alpha, model.gamma, model.ll_per_word
        ])
    live_topics = [i for i in range(model.k) if model.is_live_topic(i)]
    params = pd.DataFrame(params,
                          columns=["k", "live_k", "alpha", "gamma", "ll"])
    params.to_csv(f"{MODEL_DIR}/hdp.mcmc.csv", index=False)
    ## Trace Plots
    fig, ax = plt.subplots(2, 2, figsize=(10, 5.6), sharex=True)
    ax[0, 0].plot(params["k"], label="K", color="C0", alpha=0.8)
    ax[0, 0].plot(params["live_k"], label="Active K", color="C1", alpha=0.8)
    ax[0, 1].plot(params["alpha"], color="C2", alpha=0.8)
    ax[1, 0].plot(params["gamma"], color="C3", alpha=0.8)
    ax[1, 1].plot(params["ll"], color="black", alpha=0.8)
    ax[0, 0].legend(loc="lower right", fontsize=13)
    for i in range(2):
        ax[1, i].set_xlabel("MCMC Iteration")
        for j in range(2):
            ax[i, j].spines["right"].set_visible(False)
            ax[i, j].spines["top"].set_visible(False)
            ax[i, j].tick_params(labelsize=10)
    ax[0, 0].set_title("# Components")
    ax[0, 1].set_title("$\\alpha$")
    ax[1, 0].set_title("$\\gamma$")
    ax[1, 1].set_title("Log-Likelihood")
    fig.tight_layout()
    fig.savefig(f"{MODEL_DIR}trace.png", dpi=300)
    plt.close(fig)
    ## Save the model
    _ = model.save(f"{MODEL_DIR}/hdp.model")
    _ = model.summary(file=open(f"{MODEL_DIR}/hdp.summary.txt", "w"),
                      topic_word_top_n=20)
    with open(f"{MODEL_DIR}/hdp.summary.txt", "a") as the_file:
        the_file.write("Live Topics: {}".format(", ".join(
            list(map(str, live_topics)))))
    ## Extract Topic Distribution
    print("Caching Topic Distribution")
    topic_word_dist = np.vstack(
        [model.get_topic_word_dist(topic) for topic in live_topics])
    topic_terms = []
    for i, dist in enumerate(topic_word_dist):
        dist_sorting = np.argsort(dist)[::-1][:CACHE_TOP_K]
        dist_sorting_vocab = [[model.used_vocabs[d],
                               float(dist[d])] for d in dist_sorting]
        topic_terms.append({"topic": i, "terms": dist_sorting_vocab})
    with open(f"{MODEL_DIR}topic_terms.json", "w") as the_file:
        for tt in topic_terms:
            _ = the_file.write(f"{json.dumps(tt)}\n")
    ## Topic Assignments
    print("Caching Topic Assignments")
    doc_topic_dist = np.vstack([doc.get_topic_dist()
                                for doc in model.docs])[:, live_topics]
    topic_assignments = []
    for post_id, doc in zip(post_ids, doc_topic_dist):
        doc_topics = [[int(d), float(doc[d])] for d in doc.nonzero()[0]]
        topic_assignments.append({"id": post_id, "topics": doc_topics})
    with open(f"{MODEL_DIR}topic_assignments.json", "w") as the_file:
        for ta in topic_assignments:
            _ = the_file.write(f"{json.dumps(ta)}\n")
    print("Script Complete.")
Example 10
File: hdp.py Project: ankane/tomoto
import tomotopy as tp

model = tp.HDPModel()
print(model.alpha)
print(model.eta)
print(model.gamma)
print(model.live_k)
print(model.num_tables)
Example 11
import sys

import tomotopy as tp
import pickle
from gensim import models, corpora
from gensim.models import CoherenceModel

print(tp.isa)  # prints 'avx2', 'avx', 'sse2' or 'none'


with open('outfile', 'rb') as fp:
    text_list = pickle.load(fp)

hdp = tp.HDPModel(seed=1000, tw=tp.TermWeight.IDF, initial_k=10)
for i in text_list:
    hdp.add_doc(words=i)


# Set sampling burn-in (i.e. discard the first N iterations)
hdp.burn_in = 10000
hdp.train(0)
print('Num docs:', len(hdp.docs), ', Vocab size:', hdp.num_vocabs,
      ', Num words:', hdp.num_words)
print('Removed top words:', hdp.removed_top_words)


# Train model
for i in range(0, 1000, 100):
    hdp.train(100)  # 100 iterations at a time
    print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, hdp.ll_per_word, hdp.live_k))
    hdp.save("unigram_hdp_model.bin")
Example 12
!pip install tomotopy
import tomotopy as tp

# Train HDP model.

mdl = tp.HDPModel(min_cf=0, min_df=0, rm_top=0, initial_k=30, alpha=1, eta=0.01, gamma=1)
for line in open('bigrammed.txt'):
    mdl.add_doc(line.strip().split())

for i in range(0, 1000, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

for k in range(mdl.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=10))

mdl.summary()
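Since mdl.k in an HDP model counts every allocated topic, including dead ones, the loop above may print topics with no assigned words. A variant restricted to live topics, using is_live_topic() as in the earlier examples:

for k in range(mdl.k):
    if not mdl.is_live_topic(k):
        continue
    print('Top 10 words of live topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=10))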