def transmit_topic_model(t_token, t_model, t_k, t_min_tf, t_max_tf, t_lang):
    TOM_Corpus.MAX_FEATURES = 5000
    vectorization = ''
    if t_model == 'LDA':
        vectorization = 'tf'
    elif t_model == 'NMF':
        vectorization = 'tfidf'
    corpus = TOM_Corpus(source_file_path='csv/' + t_token + '.csv',
                        vectorization=vectorization,
                        max_relative_frequency=t_max_tf,
                        min_absolute_frequency=t_min_tf,
                        language=t_lang,
                        preprocessor=None)
    global topic_model
    if t_model == 'LDA':
        topic_model = LatentDirichletAllocation(corpus)
    elif t_model == 'NMF':
        topic_model = NonNegativeMatrixFactorization(corpus)
    if topic_model is not None:
        if t_k is None:
            t_k = 10
        t_k = int(t_k)
        topic_model.infer_topics(t_k)
        result_data = {
            'token': t_token,
            'result': '<a href="http://mediamining.univ-lyon2.fr/people/guille/cats/tom/'
                      + t_token
                      + '/topic_cloud.html" target="_blank">Open the topic model browser in a new window</a>',
        }
        json_data = json.dumps(result_data)
        results_request = urllib2.Request(
            'http://mediamining.univ-lyon2.fr/cats/module/result')
        results_request.add_header('Content-Type', 'application/json')
        results_request.data = json_data.encode('utf-8')
        urllib2.urlopen(results_request)
        print('Transmitted topic model for token ' + t_token)
        prepare_topic_model_browser()
        freeze_topic_model_browser()
        prepare_topic_model_browser()
        os.remove('csv/' + t_token + '.csv')
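For context, a call to this helper might look like the sketch below; the token value and parameter settings are illustrative assumptions, not values taken from the original code.

# Illustrative invocation only -- the token and parameter values are assumptions.
transmit_topic_model(t_token='a1b2c3', t_model='NMF', t_k=10,
                     t_min_tf=4, t_max_tf=0.8, t_lang='english')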
Example 2
def buildTopicModel(className, startTime, endTime):
    print("Building Topic Model in build_topic_model.py")
    # Parameters
    max_tf = 0.8
    min_tf = 4
    num_topics = 7
    vectorization = 'tfidf'

    MYDIR = os.path.dirname(__file__)

    # Load corpus
    corpus = getCorpus(className, startTime, endTime)
    print('corpus size:', corpus.size)
    print('vocabulary size:', len(corpus.vocabulary))

    # Infer topics
    topic_model = NonNegativeMatrixFactorization(corpus=corpus)
    topic_model.infer_topics(num_topics=num_topics)
    topic_model.print_topics(num_words=10)

    # Save the topic model for reference
    # We'll just use a placeholder path for now
    utils.save_topic_model(topic_model, os.path.join(MYDIR, getTopicModelPath(className)))
max_tf = 0.8
min_tf = 4
num_topics = 15
vectorization = 'tfidf'

# Load corpus
corpus = Corpus(source_file_path='input/egc_lemmatized.csv',
                language='french',
                vectorization=vectorization,
                max_relative_frequency=max_tf,
                min_absolute_frequency=min_tf)
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))

# Infer topics
topic_model = NonNegativeMatrixFactorization(corpus=corpus)
topic_model.infer_topics(num_topics=num_topics)
topic_model.print_topics(num_words=10)

# Clean the data directory
if os.path.exists('browser/static/data'):
    shutil.rmtree('browser/static/data')
os.makedirs('browser/static/data')

# Export topic cloud
utils.save_topic_cloud(topic_model, 'browser/static/data/topic_cloud.json')

# Export details about topics
for topic_id in range(topic_model.nb_topics):
    utils.save_word_distribution(topic_model.top_words(topic_id, 20),
                                 'browser/static/data/word_distribution' + str(topic_id) + '.tsv')
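As a sketch of a possible follow-up, the per-document topic distributions could be exported the same way the buildBrowser example further down this page does; whether this particular script needs that step is an assumption.

# Sketch only: export each document's topic distribution for the browser
# (mirrors the buildBrowser example below; not part of the original snippet).
for doc_id in range(corpus.size):
    utils.save_topic_distribution(topic_model.topic_distribution_for_document(doc_id),
                                  'browser/static/data/topic_distribution_d' + str(doc_id) + '.tsv')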
Example 4
# Load and prepare a corpus
print('Load documents from CSV')
corpus = Corpus(
    source_file_path='input/egc_lemmatized.csv',
    language='french',           # language used for stop words
    vectorization='tfidf',       # 'tf' (term frequency) or 'tfidf' (term frequency-inverse document frequency)
    max_relative_frequency=0.8,  # ignore words whose relative frequency is greater than this threshold
    min_absolute_frequency=4,    # ignore words whose absolute frequency is lower than this threshold
)
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))

# Instantiate a topic model
topic_model = NonNegativeMatrixFactorization(corpus)

# Estimate the optimal number of topics
# print('Estimating the number of topics...')
# viz = Visualization(topic_model)
# viz.plot_greene_metric(min_num_topics=10,
#                        max_num_topics=11,
#                        tao=10, step=1,
#                        top_n_words=10)
# viz.plot_arun_metric(min_num_topics=5,
#                      max_num_topics=30,
#                      iterations=10)
# viz.plot_brunet_metric(min_num_topics=5,
#                        max_num_topics=30,
#                        iterations=10)
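The original snippet stops after these commented-out estimation calls; as a sketch, fitting and inspecting the model once a topic count has been chosen would follow the same pattern as the other examples on this page (the value 15 below is only a placeholder).

# Sketch only: fit the model with a chosen number of topics and print the top words
# (15 is a placeholder, not a tuned value).
topic_model.infer_topics(num_topics=15)
topic_model.print_topics(num_words=10)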
Example 5
corpus = Corpus(
    source_file_path='Papers.csv',
    language='english',          # language used for stop words
    vectorization='tfidf',       # 'tf' (term frequency) or 'tfidf' (term frequency-inverse document frequency)
    n_gram=3,
    max_relative_frequency=0.8,  # ignore words whose relative frequency is greater than this threshold
    min_absolute_frequency=4,    # ignore words whose absolute frequency is lower than this threshold
)

print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))
#print('Vector representation of document 0:\n', corpus.vector_for_document(0))
# Instantiate a topic model
print('Instantiate a topic model...')
topic_model = NonNegativeMatrixFactorization(corpus)
topic_model.infer_topics(num_topics)
ut.save_topic_model(topic_model, 'output/NMF_30topics.tom')

print('Finding global topics...')
print('Writing global topics with their assigned names and top words...')
print('The output file is "GlobalTopicsWithName.csv"')
MakeGlobalTopics(topic_model)
print('Writing topics with their related PaperID...')
print('The output file is "TopicIDPaperID.csv"')
WritrTopicsWithPaperID()
print('Writing topics per year...')
WriteTopicPerYear(corpus, topic_model)
print('Creating the visualization file "PlotCsv.csv"')
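The model saved above with ut.save_topic_model can be reloaded later; a minimal sketch, assuming the same ut helpers used in the browser example at the end of this page.

# Minimal sketch: reload the model saved earlier (ut.load_topic_model mirrors the
# load call in the browser example below).
topic_model = ut.load_topic_model('output/NMF_30topics.tom')
topic_model.print_topics(num_words=10)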
Example 6
__author__ = "Adrien Guille, Pavel Soriano"
__email__ = "*****@*****.**"

# Load and prepare a corpus
print('Load documents from CSV')
corpus = Corpus(source_file_path='input/egc_lemmatized.csv',
                language='french',  # language used for stop words
                vectorization='tfidf',  # 'tf' (term frequency) or 'tfidf' (term frequency-inverse document frequency)
                max_relative_frequency=0.8,  # ignore words whose relative frequency is greater than this threshold
                min_absolute_frequency=4)  # ignore words whose absolute frequency is lower than this threshold
print('corpus size:', corpus.size)
print('vocabulary size:', len(corpus.vocabulary))
print('Vector representation of document 0:\n', corpus.vector_for_document(0))

# Instantiate a topic model
topic_model = NonNegativeMatrixFactorization(corpus)

# Estimate the optimal number of topics
print('Estimating the number of topics...')
viz = Visualization(topic_model)
viz.plot_greene_metric(min_num_topics=10,
                       max_num_topics=11,
                       tao=10, step=1,
                       top_n_words=10)
viz.plot_arun_metric(min_num_topics=5,
                     max_num_topics=30,
                     iterations=10)
viz.plot_brunet_metric(min_num_topics=5,
                       max_num_topics=30,
                       iterations=10)
Example 7
def buildBrowser(className, startTime, endTime):
    # Parameters
    max_tf = 0.8
    min_tf = 4
    num_topics = 7
    vectorization = 'tfidf'

    MYDIR = os.path.dirname(__file__)

    # Load corpus
    corpus = getCorpus(className, startTime, endTime)
    print('corpus size:', corpus.size)
    print('vocabulary size:', len(corpus.vocabulary))

    # Infer topics
    topic_model = NonNegativeMatrixFactorization(corpus=corpus)
    topic_model.infer_topics(num_topics=num_topics)
    topic_model.print_topics(num_words=10)

    # Save the topic model for reference
    # We'll just use a placeholder path for now
    utils.save_topic_model(topic_model, os.path.join(MYDIR, getTopicModelPath(className)))

    # Clean the data directory
    if os.path.exists(os.path.join(MYDIR, 'browser/static/data')):
        shutil.rmtree(os.path.join(MYDIR, 'browser/static/data'))
    os.makedirs(os.path.join(MYDIR, 'browser/static/data'))

    # Export topic cloud
    utils.save_topic_cloud(topic_model, os.path.join(MYDIR,'browser/static/data/topic_cloud.json'))

    # Export details about topics
    for topic_id in range(topic_model.nb_topics):
        utils.save_word_distribution(topic_model.top_words(topic_id, 20),
                                     os.path.join(MYDIR, 'browser/static/data/word_distribution') + str(topic_id) + '.tsv')
        utils.save_affiliation_repartition(topic_model.affiliation_repartition(topic_id),
                                           os.path.join(MYDIR, 'browser/static/data/affiliation_repartition') + str(topic_id) + '.tsv')

    # Export details about questions
    for doc_id in range(topic_model.corpus.size):
        utils.save_topic_distribution(topic_model.topic_distribution_for_document(doc_id),
                                      os.path.join(MYDIR, 'browser/static/data/topic_distribution_d') + str(doc_id) + '.tsv')

    # Export details about words
    for word_id in range(len(topic_model.corpus.vocabulary)):
        utils.save_topic_distribution(topic_model.topic_distribution_for_word(word_id),
                                      os.path.join(MYDIR, 'browser/static/data/topic_distribution_w') + str(word_id) + '.tsv')

    # Associate documents with topics
    topic_associations = topic_model.documents_per_topic()
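    # Sketch only -- not part of the original snippet: it assumes documents_per_topic()
    # returns a mapping from topic id to the ids of its most likely documents.
    for topic_id, doc_ids in topic_associations.items():
        print('topic', topic_id, ':', len(doc_ids), 'documents')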
Example 8
            vectorization=vectorization,
            max_relative_frequency=max_tf,
            min_absolute_frequency=min_tf,
            token_pattern= \
                      r'(?u)\b(?:' + \
                          r'[a-zA-ZáÁàÀäÄâÂéÉèÈëËêÊíÍìÌïÏîÎóÓòÒöÖôÔúÚùÙüÜûÛñÑçÇ\-]' + \
                          r'[a-zA-ZáÁàÀäÄâÂéÉèÈëËêÊíÍìÌïÏîÎóÓòÒöÖôÔúÚùÙüÜûÛñÑçÇ\-]+' + \
                        r'|[nNxXyYaAoOeEuU]' + \
                      r')\b',
            tokenizer=tokenizers[language]
        )
        print('corpus size:', corpus.size)
        print('vocabulary size:', len(corpus.vocabulary))

        # Infer topics
        topic_model = NonNegativeMatrixFactorization(corpus=corpus)
        topic_model.infer_topics(num_topics=int(min([num_topics, corpus.size])))
        topic_model.print_topics(num_words=10)

        # Export topic cloud
        utils.save_topic_cloud(topic_model, path.join(timeframe_dir, 'topic_cloud.json'))

        # Export details about topics
        for topic_id in range(topic_model.nb_topics):
            custom_save_word_distribution(custom_top_words(topic_model, topic_id, 20),
                                          path.join(timeframe_dir,'word_distribution' + str(topic_id) + '.tsv'))
            utils.save_affiliation_repartition(topic_model.affiliation_repartition(topic_id),
                                               path.join(timeframe_dir,
                                                         'affiliation_repartition' + str(topic_id) + '.tsv'))
            evolution = []
            for i in range(timeframe):
Example 9
def main(config_infer):
    # get the current datetime string for use in the output directory name
    now_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    # Data parameters
    data_dir = config_infer.get('data_dir', '', vars=os.environ)
    data_dir = data_dir or '.'
    data_dir = Path(data_dir)
    docs_filename = config_infer.get('docs_filename', '')
    if not docs_filename:
        raise ValueError(f'docs_filename not specified in {config_filepath}')
    source_filepath = data_dir / docs_filename
    if not source_filepath.exists():
        raise OSError(f'Documents file does not exist: {source_filepath}')
    # Corpus parameters
    id_col = config_infer.get('id_col', None)
    affiliation_col = config_infer.get('affiliation_col', None)
    dataset_col = config_infer.get('dataset_col', None)
    title_col = config_infer.get('title_col', None)
    author_col = config_infer.get('author_col', None)
    date_col = config_infer.get('date_col', None)
    text_col = config_infer.get('text_col', None)
    full_text_col = config_infer.get('full_text_col', None)
    corpus_name = config_infer.get('corpus_name', None)
    corpus_name = '_'.join(corpus_name.split()) if corpus_name else 'corpus'  # remove spaces
    language = config_infer.get('language', None)
    assert (isinstance(language, str) and language in ['english']) or (isinstance(language, list)) or (language is None)
    # ignore words whose relative frequency is greater than max_relative_frequency
    max_relative_frequency = config_infer.getfloat('max_relative_frequency', 0.8)
    # ignore words whose absolute frequency is lower than min_absolute_frequency
    min_absolute_frequency = config_infer.getint('min_absolute_frequency', 5)
    # 'tf' (term frequency) or 'tfidf' (term frequency-inverse document frequency)
    vectorization = config_infer.get('vectorization', 'tfidf')
    n_gram = config_infer.getint('n_gram', 1)
    max_features = config_infer.get('max_features', None)
    if isinstance(max_features, str):
        if max_features.isnumeric():
            max_features = int(max_features)
        elif max_features == 'None':
            max_features = None
    assert isinstance(max_features, int) or (max_features is None)
    sample = config_infer.getfloat('sample', 1.0)
    # General model parameters
    model_type = config_infer.get('model_type', 'NMF')
    verbose = config_infer.getint('verbose', 0)
    random_state = config_infer.getint('random_state', None)
    # NMF parameters
    nmf_init = config_infer.get('nmf_init', None)
    nmf_solver = config_infer.get('nmf_solver', None)
    nmf_beta_loss = config_infer.get('nmf_beta_loss', 'frobenius')
    nmf_max_iter = config_infer.getint('nmf_max_iter', None)
    nmf_alpha = config_infer.getfloat('nmf_alpha', None)
    nmf_l1_ratio = config_infer.getfloat('nmf_l1_ratio', None)
    nmf_shuffle = config_infer.getboolean('nmf_shuffle', None)
    # LDA parameters
    lda_algorithm = config_infer.get('lda_algorithm', 'variational')
    lda_alpha = config_infer.getfloat('lda_alpha', None)
    lda_eta = config_infer.getfloat('lda_eta', None)
    lda_learning_method = config_infer.get('lda_learning_method', 'batch')
    lda_n_jobs = config_infer.getint('lda_n_jobs', -1)
    lda_n_iter = config_infer.getint('lda_n_iter', None)

    # Assessment config parameters
    min_num_topics = config_infer.getint('min_num_topics', 11)
    max_num_topics = config_infer.getint('max_num_topics', 49)
    step = config_infer.getint('step', 2)
    greene_tao = config_infer.getint('greene_tao', 10)
    greene_top_n_words = config_infer.getint('greene_top_n_words', 10)
    greene_sample = config_infer.getfloat('greene_sample', 0.8)
    arun_iterations = config_infer.getint('arun_iterations', 10)
    brunet_iterations = config_infer.getint('brunet_iterations', 10)
    coherence_w2v_top_n_words = config_infer.getint('coherence_w2v_top_n_words', 10)
    coherence_w2v_size = config_infer.getint('coherence_w2v_size', 100)
    # perplexity_train_size = config_infer.getfloat('perplexity_train_size', 0.7)

    if model_type not in ['NMF', 'LDA']:
        raise ValueError(f"model_type must be 'NMF' or 'LDA', got {model_type}")

    if model_type == 'NMF':
        if (nmf_solver == 'mu') and (nmf_beta_loss not in ['frobenius', 'kullback-leibler', 'itakura-saito']):
            raise ValueError(f"For NMF, 'beta_loss' must be 'frobenius', 'kullback-leibler', or 'itakura-saito', got '{nmf_beta_loss}'")
        if vectorization == 'tf':
            raise ValueError(f"for NMF, 'vectorization' should be 'tfidf', got '{vectorization}'")
    elif model_type == 'LDA':
        if lda_algorithm not in ['variational', 'gibbs']:
            raise ValueError(f"For LDA, 'lda_algorithm' must be 'variational' or 'gibbs', got '{lda_algorithm}'")
        if vectorization == 'tfidf':
            raise ValueError(f"for LDA, 'vectorization' should be 'tf', got '{vectorization}'")

    # Load and prepare a corpus
    logger.info(f'Loading documents: {source_filepath}')
    corpus = Corpus(
        source_filepath=source_filepath,
        name=corpus_name,
        language=language,
        vectorization=vectorization,
        n_gram=n_gram,
        max_relative_frequency=max_relative_frequency,
        min_absolute_frequency=min_absolute_frequency,
        max_features=max_features,
        sample=sample,
        id_col=id_col,
        affiliation_col=affiliation_col,
        dataset_col=dataset_col,
        title_col=title_col,
        author_col=author_col,
        date_col=date_col,
        text_col=text_col,
        full_text_col=full_text_col,
    )
    logger.info(f'Corpus size: {corpus.size:,}')
    logger.info(f'Vocabulary size: {corpus.vocabulary_size:,}')

    # Initialize topic model
    if model_type == 'NMF':
        topic_model = NonNegativeMatrixFactorization(corpus=corpus)
    elif model_type == 'LDA':
        topic_model = LatentDirichletAllocation(corpus=corpus)

    # Estimate the optimal number of topics
    num_topics_infer = range(min_num_topics, max_num_topics + 1, step)
    logger.info(f'Total number of topics to infer: {len(num_topics_infer)}')
    logger.info(f'Topic numbers: {list(num_topics_infer)}')

    output_dir = f'assess_{topic_model.model_type}_{source_filepath.stem}_{now_str}'

    viz = Visualization(topic_model, output_dir=output_dir)

    logger.info('Estimating the number of topics to choose. This could take a while...')
    logger.info(f'Will save results to: {viz.output_dir}')

    logger.info('Assessing Greene metric')
    viz.plot_greene_metric(
        min_num_topics=min_num_topics,
        max_num_topics=max_num_topics,
        step=step,
        tao=greene_tao,
        top_n_words=greene_top_n_words,
        sample=greene_sample,
        random_state=random_state,
        verbose=verbose,
        nmf_init=nmf_init,
        nmf_solver=nmf_solver,
        nmf_beta_loss=nmf_beta_loss,
        nmf_max_iter=nmf_max_iter,
        nmf_alpha=nmf_alpha,
        nmf_l1_ratio=nmf_l1_ratio,
        nmf_shuffle=nmf_shuffle,
        lda_algorithm=lda_algorithm,
        lda_alpha=lda_alpha,
        lda_eta=lda_eta,
        lda_learning_method=lda_learning_method,
        lda_n_jobs=lda_n_jobs,
        lda_n_iter=lda_n_iter,
    )

    logger.info('Assessing Arun metric')
    viz.plot_arun_metric(
        min_num_topics=min_num_topics,
        max_num_topics=max_num_topics,
        step=step,
        iterations=arun_iterations,
        random_state=random_state,
        verbose=verbose,
        nmf_init=nmf_init,
        nmf_solver=nmf_solver,
        nmf_beta_loss=nmf_beta_loss,
        nmf_max_iter=nmf_max_iter,
        nmf_alpha=nmf_alpha,
        nmf_l1_ratio=nmf_l1_ratio,
        nmf_shuffle=nmf_shuffle,
        lda_algorithm=lda_algorithm,
        lda_alpha=lda_alpha,
        lda_eta=lda_eta,
        lda_learning_method=lda_learning_method,
        lda_n_jobs=lda_n_jobs,
        lda_n_iter=lda_n_iter,
    )

    logger.info('Assessing Coherence Word2Vec metric')
    viz.plot_coherence_w2v_metric(
        min_num_topics=min_num_topics,
        max_num_topics=max_num_topics,
        step=step,
        top_n_words=coherence_w2v_top_n_words,
        w2v_size=coherence_w2v_size,
        random_state=random_state,
        verbose=verbose,
        nmf_init=nmf_init,
        nmf_solver=nmf_solver,
        nmf_beta_loss=nmf_beta_loss,
        nmf_max_iter=nmf_max_iter,
        nmf_alpha=nmf_alpha,
        nmf_l1_ratio=nmf_l1_ratio,
        nmf_shuffle=nmf_shuffle,
        lda_algorithm=lda_algorithm,
        lda_alpha=lda_alpha,
        lda_eta=lda_eta,
        lda_learning_method=lda_learning_method,
        lda_n_jobs=lda_n_jobs,
        lda_n_iter=lda_n_iter,
    )

    logger.info('Assessing Brunet metric')
    viz.plot_brunet_metric(
        min_num_topics=min_num_topics,
        max_num_topics=max_num_topics,
        step=step,
        iterations=brunet_iterations,
        random_state=random_state,
        verbose=verbose,
        nmf_init=nmf_init,
        nmf_solver=nmf_solver,
        nmf_beta_loss=nmf_beta_loss,
        nmf_max_iter=nmf_max_iter,
        nmf_alpha=nmf_alpha,
        nmf_l1_ratio=nmf_l1_ratio,
        nmf_shuffle=nmf_shuffle,
        lda_algorithm=lda_algorithm,
        lda_alpha=lda_alpha,
        lda_eta=lda_eta,
        lda_learning_method=lda_learning_method,
        lda_n_jobs=lda_n_jobs,
        lda_n_iter=lda_n_iter,
    )
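A sketch of how this entry point might be driven; the configparser-based loading, the file name, and the section name below are assumptions inferred from the .get/.getint/.getfloat calls above, not code from the original project.

# Illustrative only: assumes config_infer is a configparser section (consistent with the
# .get(..., vars=os.environ), .getint and .getfloat calls above); file and section names are made up.
if __name__ == '__main__':
    import configparser
    config = configparser.ConfigParser()
    config.read('assess.ini')     # hypothetical config file
    main(config['assess'])        # hypothetical section name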
Example 10
def main(config_browser):
    # Data parameters
    data_dir = config_browser.get('data_dir', '', vars=os.environ)
    data_dir = data_dir or '.'
    data_dir = Path(data_dir)
    docs_filename = config_browser.get('docs_filename', '')
    if not docs_filename:
        raise ValueError(f'docs_filename not specified in {config_filepath}')
    source_filepath = data_dir / docs_filename
    if not source_filepath.exists():
        raise OSError(f'Documents file does not exist: {source_filepath}')
    # Corpus parameters
    id_col = config_browser.get('id_col', None)
    affiliation_col = config_browser.get('affiliation_col', None)
    dataset_col = config_browser.get('dataset_col', None)
    title_col = config_browser.get('title_col', None)
    author_col = config_browser.get('author_col', None)
    date_col = config_browser.get('date_col', None)
    text_col = config_browser.get('text_col', None)
    full_text_col = config_browser.get('full_text_col', None)
    corpus_name = config_browser.get('corpus_name', None)
    corpus_name = '_'.join(
        corpus_name.split()) if corpus_name else 'corpus'  # remove spaces
    language = config_browser.get('language', None)
    assert (isinstance(language, str)
            and language in ['english']) or (isinstance(
                language, list)) or (language is None)
    # ignore words whose relative frequency is greater than max_relative_frequency
    max_relative_frequency = config_browser.getfloat('max_relative_frequency', 0.8)
    # ignore words whose absolute frequency is lower than min_absolute_frequency
    min_absolute_frequency = config_browser.getint('min_absolute_frequency', 5)
    # 'tf' (term frequency) or 'tfidf' (term frequency-inverse document frequency)
    vectorization = config_browser.get('vectorization', 'tfidf')
    n_gram = config_browser.getint('n_gram', 1)
    max_features = config_browser.get('max_features', None)
    if isinstance(max_features, str):
        if max_features.isnumeric():
            max_features = int(max_features)
        elif max_features == 'None':
            max_features = None
    assert isinstance(max_features, int) or (max_features is None)
    sample = config_browser.getfloat('sample', 1.0)
    # General model parameters
    model_type = config_browser.get('model_type', 'NMF')
    num_topics = config_browser.getint('num_topics', 15)
    verbose = config_browser.getint('verbose', 0)
    random_state = config_browser.getint('random_state', None)
    rename_topics = config_browser.get('rename_topics', None)
    rename_topics = rename_topics.split(',') if rename_topics else None
    merge_topics = config_browser.get('merge_topics', None)
    if merge_topics:
        merge_topics = {
            t.split(':')[0]: t.split(':')[1:][0].split(',')
            for t in merge_topics.split('.') if t
        }
    # must define the state if renaming or merging topics
    if rename_topics or merge_topics:
        assert random_state is not None
    load_if_existing_model = config_browser.getboolean(
        'load_if_existing_model', True)
    # NMF parameters
    nmf_init = config_browser.get('nmf_init', None)
    nmf_solver = config_browser.get('nmf_solver', None)
    nmf_beta_loss = config_browser.get('nmf_beta_loss', 'frobenius')
    nmf_max_iter = config_browser.getint('nmf_max_iter', None)
    nmf_alpha = config_browser.getfloat('nmf_alpha', None)
    nmf_l1_ratio = config_browser.getfloat('nmf_l1_ratio', None)
    nmf_shuffle = config_browser.getboolean('nmf_shuffle', None)
    # LDA parameters
    lda_algorithm = config_browser.get('lda_algorithm', 'variational')
    lda_alpha = config_browser.getfloat('lda_alpha', None)
    lda_eta = config_browser.getfloat('lda_eta', None)
    lda_learning_method = config_browser.get('lda_learning_method', 'batch')
    lda_n_jobs = config_browser.getint('lda_n_jobs', -1)
    lda_n_iter = config_browser.getint('lda_n_iter', None)
    # Web app parameters
    top_words_description = config_browser.getint('top_words_description', 10)
    top_words_cloud = config_browser.getint('top_words_cloud', 5)

    if model_type not in ['NMF', 'LDA']:
        raise ValueError(
            f"model_type must be 'NMF' or 'LDA', got {model_type}")

    if model_type == 'NMF':
        if (nmf_solver == 'mu') and (nmf_beta_loss not in [
                'frobenius', 'kullback-leibler', 'itakura-saito'
        ]):
            raise ValueError(
                f"For NMF, 'beta_loss' must be 'frobenius', 'kullback-leibler', or 'itakura-saito', got '{nmf_beta_loss}'"
            )
        if vectorization == 'tf':
            raise ValueError(
                f"for NMF, 'vectorization' should be 'tfidf', got '{vectorization}'"
            )
    elif model_type == 'LDA':
        if lda_algorithm not in ['variational', 'gibbs']:
            raise ValueError(
                f"For LDA, 'lda_algorithm' must be 'variational' or 'gibbs', got '{lda_algorithm}'"
            )
        if vectorization == 'tfidf':
            raise ValueError(
                f"for LDA, 'vectorization' should be 'tf', got '{vectorization}'"
            )

    if rename_topics:
        assert len(rename_topics) == num_topics

    # Flask Web server
    static_folder = Path('browser/static')
    template_folder = Path('browser/templates')

    # Set up directories for serving files
    tm_folder = Path(
        'data') / f'{model_type}_{source_filepath.stem}_{num_topics}_topics'
    data_folder = tm_folder / 'data'
    model_folder = tm_folder / 'model'
    topic_model_filepath = model_folder / 'model.pickle'

    # Set up sub-directories for serving files
    topic_cloud_folder = data_folder / 'topic_cloud'
    # # author_network_folder = data_folder / 'author_network'
    figs_folder = data_folder / 'figs'

    # ##################################
    # Load or train model
    # ##################################

    if load_if_existing_model and (static_folder /
                                   topic_model_filepath).exists():
        # Load model from disk:
        logger.info(
            f'Loading topic model: {static_folder / topic_model_filepath}')
        topic_model = ut.load_topic_model(static_folder / topic_model_filepath)

        # if loading a model and random_state is set, ensure they match
        if random_state:
            assert topic_model.random_state == random_state

        logger.info(f'Corpus size: {topic_model.corpus.size:,}')
        logger.info(f'Vocabulary size: {topic_model.corpus.vocabulary_size:,}')
    else:
        # Clean the topic model directory
        if (static_folder / tm_folder).exists():
            ut.delete_folder(static_folder / tm_folder)
        (static_folder / tm_folder).mkdir(parents=True, exist_ok=False)

        # Load and prepare a corpus
        logger.info(f'Loading documents: {source_filepath}')
        corpus = Corpus(
            source_filepath=source_filepath,
            name=corpus_name,
            language=language,
            vectorization=vectorization,
            n_gram=n_gram,
            max_relative_frequency=max_relative_frequency,
            min_absolute_frequency=min_absolute_frequency,
            max_features=max_features,
            sample=sample,
            id_col=id_col,
            affiliation_col=affiliation_col,
            dataset_col=dataset_col,
            title_col=title_col,
            author_col=author_col,
            date_col=date_col,
            text_col=text_col,
            full_text_col=full_text_col,
        )
        # Initialize topic model
        if model_type == 'NMF':
            topic_model = NonNegativeMatrixFactorization(corpus=corpus)
        elif model_type == 'LDA':
            topic_model = LatentDirichletAllocation(corpus=corpus)

        logger.info(f'Corpus size: {topic_model.corpus.size:,}')
        logger.info(f'Vocabulary size: {topic_model.corpus.vocabulary_size:,}')

        # Infer topics
        logger.info(f'Inferring {num_topics} topics')
        if model_type == 'NMF':
            topic_model.infer_topics(
                num_topics=num_topics,
                nmf_init=nmf_init,
                nmf_solver=nmf_solver,
                nmf_beta_loss=nmf_beta_loss,
                nmf_max_iter=nmf_max_iter,
                nmf_alpha=nmf_alpha,
                nmf_l1_ratio=nmf_l1_ratio,
                nmf_shuffle=nmf_shuffle,
                verbose=verbose,
                random_state=random_state,
            )
        elif model_type == 'LDA':
            topic_model.infer_topics(
                num_topics=num_topics,
                lda_algorithm=lda_algorithm,
                lda_alpha=lda_alpha,
                lda_eta=lda_eta,
                lda_learning_method=lda_learning_method,
                lda_n_jobs=lda_n_jobs,
                lda_n_iter=lda_n_iter,
                verbose=verbose,
                random_state=random_state,
            )

        # Save model on disk
        logger.info(f'Saving topic model: {topic_model_filepath}')
        ut.save_topic_model(topic_model, static_folder / topic_model_filepath)

    topic_cols_all = [
        ' '.join(tw)
        for tw in topic_model.top_words_topics(num_words=top_words_description)
    ]
    if rename_topics:
        rename = {tc: d for tc, d in zip(topic_cols_all, rename_topics)}
    else:
        rename = None

    # Get the top words for each topic for use around the site
    topic_description = [
        f"Topic {i:2d}: {rename_topics[i] + ' --- ' if rename_topics else None}{', '.join(tw)}"
        for i, tw in enumerate(
            topic_model.top_words_topics(num_words=top_words_description))
    ]

    # Save the top words to CSV
    num_top_words_save = 20
    logger.info(f'Saving top {num_top_words_save} words CSV and XLSX')
    top_words_filename = f'{topic_model.corpus.name}_{topic_model.nb_topics}_topics_top_{num_top_words_save}_words'
    ut.save_top_words(num_top_words_save, topic_model,
                      static_folder / data_folder / top_words_filename)

    # Get the vocabulary and split it into sublists
    n_cols = 5
    words_per_col = int(ceil(topic_model.corpus.vocabulary_size / n_cols))
    split_vocabulary = [
        sublist for sublist in ut.chunks(
            [(k, v)
             for k, v in topic_model.corpus.vocabulary.items()], words_per_col)
    ]

    # Export topic cloud
    logger.info('Saving topic cloud')
    ut.save_topic_cloud(topic_model,
                        static_folder / topic_cloud_folder /
                        'topic_cloud.json',
                        top_words=top_words_cloud)

    # # Export per-topic author network using the most likely documents for each topic
    # logger.info('Saving author network details')
    # for topic_id in range(topic_model.nb_topics):
    #     ut.save_json_object(topic_model.corpus.collaboration_network(topic_model.documents_for_topic(topic_id)),
    #                         static_folder / author_network_folder / f'author_network{topic_id}.json')

    logger.info('Done.')

    # ##################################
    # Make plots for the main index page
    # ##################################

    logger.info('Creating plots...')

    # Always create these images so they are up to date and their paths reflect the variables above

    normalized = True
    thresh = 0.1
    freq = '1YS'
    ma_window = None
    savefig = True
    ncols = 7
    nchar_title = 30
    dpi = 72
    figformat = 'png'
    by_affil_list = [False, True]
    if merge_topics:
        merge_topics_list = [False, True]
    else:
        merge_topics_list = [False, False]

    viz = Visualization(topic_model, output_dir=static_folder / figs_folder)

    logger.info(f'Will save figures and figure data to: {viz.output_dir}')

    # count
    docs_over_time_count_line, docs_over_time_count_filepath = viz.plotly_docs_over_time(
        freq=freq,
        count=True,
        by_affil=True,
        ma_window=ma_window,
        output_type='div',
        savedata=True,
    )

    # percent
    docs_over_time_percent_line, docs_over_time_percent_filepath = viz.plotly_docs_over_time(
        freq=freq,
        count=False,
        by_affil=True,
        ma_window=ma_window,
        output_type='div',
        savedata=True,
    )

    # average topic loading
    topic_loading_barplot, topic_loading_filepath = viz.plotly_doc_topic_loading(
        rename=rename,
        normalized=normalized,
        n_words=top_words_description,
        output_type='div',
        savedata=True,
    )

    # topic_heatmap, topic_heatmap_filepath = viz.plotly_heatmap(
    #     rename=rename,
    #     normalized=normalized,
    #     n_words=top_words_description,
    #     annotate=True,
    #     annot_decimals=2,
    #     annot_fontsize=7,
    #     annot_fontcolor='black',
    #     output_type='div',
    #     savedata=False,
    # )

    topic_clustermap, topic_clustermap_filepath, topic_heatmap_filepath = viz.plotly_clustermap(
        rename=rename,
        normalized=normalized,
        n_words=top_words_description,
        annotate=True,
        annot_decimals=2,
        annot_fontsize=7,
        annot_fontcolor='black',
        output_type='div',
        savedata=True,
    )

    totc = []
    totp = []
    # totl = []
    for i, mt in enumerate(merge_topics_list):
        for ba in by_affil_list:
            if (not any(merge_topics_list)) and (i == 1):
                fig_topic_over_time_count = None
            else:
                _, _, fig_topic_over_time_count = viz.plot_topic_over_time_count(
                    rename=rename,
                    merge_topics=merge_topics if mt else None,
                    normalized=normalized,
                    thresh=thresh,
                    freq=freq,
                    n_words=top_words_description,
                    by_affil=ba,
                    ma_window=ma_window,
                    nchar_title=nchar_title,
                    ncols=ncols,
                    savefig=savefig,
                    dpi=dpi,
                    figformat=figformat,
                )
            totc.append(fig_topic_over_time_count)

            if (not any(merge_topics_list)) and (i == 1):
                fig_topic_over_time_percent = None
            else:
                _, _, fig_topic_over_time_percent = viz.plot_topic_over_time_percent(
                    rename=rename,
                    merge_topics=merge_topics if mt else None,
                    normalized=normalized,
                    thresh=thresh,
                    freq=freq,
                    n_words=top_words_description,
                    by_affil=ba,
                    ma_window=ma_window,
                    nchar_title=nchar_title,
                    ncols=ncols,
                    savefig=savefig,
                    dpi=dpi,
                    figformat=figformat,
                )
            totp.append(fig_topic_over_time_percent)

            # if (not any(merge_topics_list)) and (i == 1):
            #     fig_topic_over_time_loading = None
            # else:
            #     _, _, fig_topic_over_time_loading = viz.plot_topic_over_time_loading(
            #         rename=rename,
            #         merge_topics=merge_topics if mt else None,
            #         normalized=normalized,
            #         thresh=thresh,
            #         freq=freq,
            #         n_words=top_words_description,
            #         by_affil=ba,
            #         ma_window=ma_window,
            #         nchar_title=nchar_title,
            #         ncols=ncols,
            #         savefig=savefig,
            #         dpi=dpi,
            #         figformat=figformat,
            #     )
            # totl.append(fig_topic_over_time_loading)

    # _, _, fig_topic_topic_corr_heatmap = viz.plot_heatmap(
    #     rename=rename,
    #     normalized=normalized,
    #     fmt='.2f',
    #     annot_fontsize=12,
    #     n_words=top_words_description,
    #     savefig=savefig,
    #     dpi=dpi,
    #     figformat=figformat,
    # )

    _, fig_topic_topic_corr_clustermap = viz.plot_clustermap(
        rename=rename,
        normalized=normalized,
        fmt='.2f',
        annot_fontsize=12,
        n_words=top_words_description,
        savefig=savefig,
        dpi=dpi,
        figformat=figformat,
    )

    # # debug
    # fig_topic_over_time_count = ''
    # fig_topic_over_time_percent = ''
    # fig_topic_over_time_loading = ''
    # fig_topic_over_time_count_affil = ''
    # fig_topic_over_time_percent_affil = ''
    # fig_topic_over_time_loading_affil = ''
    # fig_topic_topic_corr_heatmap = ''
    # fig_topic_topic_corr_clustermap = ''

    logger.info('Done.')

    # ##################################
    # Print info
    # ##################################

    topic_model.print_topics(num_words=10)

    server = Flask(__name__,
                   static_folder=static_folder,
                   template_folder=template_folder)

    # ##################################
    # Set up topic loading similarity app
    # ##################################

    external_stylesheets = [
        'https://codepen.io/chriddyp/pen/bWLwgP.css',
    ]

    app = dash.Dash(
        __name__,
        server=server,
        routes_pathname_prefix='/topic_loading_similarity/',
        external_stylesheets=external_stylesheets,
    )

    app.title = 'Topic Loading Similarity'
    similarity_col = 'similarity'

    cols_sim = [
        similarity_col,
        topic_model.corpus._title_col,
        topic_model.corpus._dataset_col,
        topic_model.corpus._affiliation_col,
        topic_model.corpus._author_col,
        topic_model.corpus._date_col,
        id_col,
    ]
    cols_nosim = [
        c for c in cols_sim if c in topic_model.corpus.data_frame.columns
    ]

    app.layout = html.Div([
        html.Div([
            html.Div(
                html.
                P('Drag or click the sliders to describe a topic loading vector. The most similar documents are displayed below.'
                  ),
                style={'float': 'left'},
            ),
            html.Div(
                html.A('Back to topic browser', id='back-to-main', href='../'),
                style={'float': 'right'},
            ),
        ]),
        html.Div(html.P('')),
        html.Div(
            [
                html.Div([
                    html.Div(
                        dcc.Slider(
                            id=f'slider-topic-{n}',
                            min=0.0,
                            max=1.0,
                            step=0.1,
                            value=0.0,  # starting value
                            updatemode='drag',
                        ),
                        style={
                            'width': '20%',
                            'display': 'inline-block',
                        },
                    ),
                    html.Div(
                        id=f'slider-output-container-{n}',
                        style={
                            'marginLeft': 10,
                            'marginRight': 5,
                            'font-size': 'small',
                            'display': 'inline-block',
                        },
                    ),
                    html.Div(
                        html.Label(topic_description[n]),
                        style={
                            'font-weight': 'bold',
                            'font-size': 'small',
                            'width': '75%',
                            'display': 'inline-block',
                        },
                    ),
                ]) for n in range(topic_model.nb_topics)
            ],
            style={
                'width': '100%',
                'display': 'inline-block'
            },
        ),
        html.Label('Number of documents to display'),
        html.Div(
            dcc.Dropdown(
                id='num-docs-dropdown',
                options=[
                    {
                        'label': '10',
                        'value': 10
                    },
                    {
                        'label': '50',
                        'value': 50
                    },
                    {
                        'label': '100',
                        'value': 100
                    },
                    {
                        'label': '200',
                        'value': 200
                    },
                    {
                        'label': 'All',
                        'value': topic_model.corpus.size
                    },
                ],
                value=10,
                placeholder='Select...',
            ),
            style={
                'width': '10%',
                'display': 'inline-block',
            },
        ),
        html.Div(
            html.A(
                html.Button('Export to CSV'),
                id='download-link',
                download=f'{corpus_name}_topic_loading_similarity.csv',
                href='',
                target='_blank',
            ),
            style={
                'display': 'inline-block',
                'float': 'right',
            },
        ),
        html.Div([
            dt.DataTable(
                id='doc-table',
                data=[],
                columns=[{
                    "name": i,
                    "id": i
                } for i in cols_sim],
                style_table={'overflowX': 'scroll'},
                style_cell={
                    'minWidth': '0px',
                    'maxWidth': '250px',
                    'whiteSpace': 'normal'
                },
                style_cell_conditional=[
                    {
                        'if': {
                            'column_id': similarity_col
                        },
                        'width': '7%'
                    },
                    {
                        'if': {
                            'column_id': topic_model.corpus._title_col
                        },
                        'width': '39%'
                    },
                    {
                        'if': {
                            'column_id': topic_model.corpus._dataset_col
                        },
                        'width': '6%'
                    },
                    {
                        'if': {
                            'column_id': topic_model.corpus._affiliation_col
                        },
                        'width': '14%'
                    },
                    {
                        'if': {
                            'column_id': topic_model.corpus._author_col
                        },
                        'width': '12%'
                    },
                    {
                        'if': {
                            'column_id': topic_model.corpus._date_col
                        },
                        'width': '7%'
                    },
                    {
                        'if': {
                            'column_id': id_col
                        },
                        'width': '15%'
                    },
                ],
                style_data_conditional=[{
                    'if': {
                        'row_index': 'odd'
                    },
                    'backgroundColor': 'rgb(248, 248, 248)'
                }],
                style_header={
                    'backgroundColor': 'rgb(230, 230, 230)',
                    'fontWeight': 'bold'
                },
                css=[{
                    'selector':
                    '.dash-cell div.dash-cell-value',
                    'rule':
                    'display: inline; white-space: inherit; overflow: inherit; text-overflow: inherit;'
                }],
                editable=False,
                row_deletable=False,
                filter_action='native',
                sort_action='native',
                page_action='native',
                page_current=0,
                page_size=100,
                style_as_list_view=False,
            ),
        ]),
    ])

    for n in range(topic_model.nb_topics):

        @app.callback(
            Output(f'slider-output-container-{n}', 'children'),
            [Input(f'slider-topic-{n}', 'value')],
        )
        def update_output(slider_n_value):
            return f'{slider_n_value:.1f}'

    def filter_data(vector, num_docs=None, round_decimal=None):
        if not num_docs:
            num_docs = 10
        if not round_decimal:
            round_decimal = 4
        doc_ids_sims = topic_model.similar_documents(vector, num_docs=num_docs)
        doc_ids = [x[0] for x in doc_ids_sims]
        result = topic_model.corpus.data_frame.reindex(columns=cols_nosim,
                                                       index=doc_ids)
        result[similarity_col] = [
            round(x[1], round_decimal) for x in doc_ids_sims
        ]
        result[topic_model.corpus._date_col] = result[
            topic_model.corpus._date_col].dt.strftime('%Y-%m-%d')
        return result

    @app.callback(
        Output('doc-table', 'data'),
        [
            Input(f'slider-topic-{n}', 'value')
            for n in range(topic_model.nb_topics)
        ] + [Input('num-docs-dropdown', 'value')],
    )
    def update_table(*args):
        vector = list(args[:-1])
        num_docs = args[-1]
        return filter_data(vector, num_docs).to_dict('records')

    @app.callback(
        Output('download-link', 'href'),
        [
            Input(f'slider-topic-{n}', 'value')
            for n in range(topic_model.nb_topics)
        ] + [Input('num-docs-dropdown', 'value')],
    )
    def update_download_link(*args):
        vector = list(args[:-1])
        num_docs = args[-1]
        return 'data:text/csv;charset=utf-8,%EF%BB%BF' + urllib.parse.quote(
            filter_data(vector, num_docs).to_csv(index=False,
                                                 encoding='utf-8'))

    # ##################################
    # Serve pages
    # ##################################

    @server.route('/')
    def index():
        return render_template(
            'index.html',
            topic_ids=topic_description,
            doc_ids=range(topic_model.corpus.size),
            method=type(topic_model).__name__,
            corpus_name=corpus_name,
            corpus_size=topic_model.corpus.size,
            vocabulary_size=topic_model.corpus.vocabulary_size,
            max_relative_frequency=max_relative_frequency,
            min_absolute_frequency=min_absolute_frequency,
            vectorization=vectorization,
            num_topics=num_topics,
            random_state=topic_model.random_state,
            top_words_csv=data_folder / f'{top_words_filename}.csv',
            top_words_xlsx=data_folder / f'{top_words_filename}.xlsx',
            docs_over_time_count_line=docs_over_time_count_line,
            docs_over_time_count_filepath=figs_folder /
            docs_over_time_count_filepath,
            docs_over_time_percent_line=docs_over_time_percent_line,
            docs_over_time_percent_filepath=figs_folder /
            docs_over_time_percent_filepath,
            topic_loading_barplot=topic_loading_barplot,
            topic_loading_filepath=figs_folder / topic_loading_filepath,
            # topic_heatmap=topic_heatmap,
            topic_clustermap=topic_clustermap,
            topic_clustermap_filepath=figs_folder / topic_clustermap_filepath,
            topic_heatmap_filepath=figs_folder / topic_heatmap_filepath,
            fig_topic_over_time_count=figs_folder / totc[0] if totc[0] else
            None,  # count, original topics, combined affiliations
            fig_topic_over_time_percent=figs_folder / totp[0] if totp[0] else
            None,  # percent, original topics, combined affiliations
            # fig_topic_over_time_loading=figs_folder / totl[0] if totl[0] else None,  # loading, original topics, combined affiliations
            fig_topic_over_time_count_affil=figs_folder / totc[1]
            if totc[1] else None,  # count, original topics, split affiliations
            fig_topic_over_time_percent_affil=figs_folder / totp[1] if totp[1]
            else None,  # percent, original topics, split affiliations
            # fig_topic_over_time_loading_affil=figs_folder / totl[1] if totl[1] else None,  # loading, original topics, split affiliations
            fig_topic_over_time_count_merged=figs_folder / totc[2] if totc[2]
            else None,  # count, merged topics, combined affiliations
            fig_topic_over_time_percent_merged=figs_folder / totp[2] if totp[2]
            else None,  # percent, merged topics, combined affiliations
            # fig_topic_over_time_loading_merged=figs_folder / totl[2] if totl[2] else None,  # loading, merged topics, combined affiliations
            fig_topic_over_time_count_affil_merged=figs_folder / totc[3]
            if totc[3] else None,  # count, merged topics, split affiliations
            fig_topic_over_time_percent_affil_merged=figs_folder / totp[3]
            if totp[3] else None,  # percent, merged topics, split affiliations
            # fig_topic_over_time_loading_affil_merged=figs_folder / totl[3] if totl[3] else None,  # loading, merged topics, split affiliations
            # fig_topic_topic_corr_heatmap=figs_folder / fig_topic_topic_corr_heatmap,
            fig_topic_topic_corr_clustermap=figs_folder /
            fig_topic_topic_corr_clustermap,
        )

    @server.route('/topic_cloud.html')
    def topic_cloud():
        return render_template(
            'topic_cloud.html',
            topic_ids=topic_description,
            doc_ids=range(topic_model.corpus.size),
            topic_cloud_filename=topic_cloud_folder / 'topic_cloud.json',
        )

    @server.route('/vocabulary.html')
    def vocabulary():
        return render_template(
            'vocabulary.html',
            topic_ids=topic_description,
            split_vocabulary=split_vocabulary,
            vocabulary_size=topic_model.corpus.vocabulary_size,
        )

    @server.route('/topic/<tid>.html')
    def topic_details(tid: str):
        tid = int(tid)
        # get the most likely documents per topic
        ids = topic_model.documents_for_topic(tid)
        # # get the top 100 documents per topic
        # ids = list(topic_model.top_topic_docs(topics=tid, top_n=100))[0][1]
        documents = []
        for i, document_id in enumerate(ids):
            documents.append((
                i + 1,
                topic_model.corpus.title(document_id).title(),
                ', '.join(topic_model.corpus.dataset(document_id)).title(),
                ', '.join(topic_model.corpus.affiliation(document_id)).title(),
                ', '.join(topic_model.corpus.author(document_id)).title(),
                topic_model.corpus.date(document_id).strftime('%Y-%m-%d'),
                topic_model.corpus.id(document_id),
                document_id,
            ), )

        topic_word_weight_barplot, _ = viz.plotly_topic_word_weight(
            tid,
            normalized=True,
            n_words=20,
            output_type='div',
            savedata=False)
        topic_over_time_percent_line, _ = viz.plotly_topic_over_time(
            tid, count=False, output_type='div', savedata=False)
        topic_affiliation_count_barplot, _ = viz.plotly_topic_affiliation_count(
            tid, output_type='div', savedata=False)

        return render_template(
            'topic.html',
            topic_id=tid,
            description=f"{tid}{': ' + rename_topics[tid] if rename_topics else ''}",
            frequency=round(topic_model.topic_frequency(tid) * 100, 2),
            documents=documents,
            topic_ids=topic_description,
            doc_ids=range(topic_model.corpus.size),
            topic_word_weight_barplot=topic_word_weight_barplot,
            topic_over_time_percent_line=topic_over_time_percent_line,
            topic_affiliation_count_barplot=topic_affiliation_count_barplot,
            # author_network_filename=author_network_folder / f'author_network{tid}.json',
        )

    @server.route('/document/<did>.html')
    def document_details(did: str):
        did = int(did)
        vector = topic_model.corpus.word_vector_for_document(did)
        word_list = []
        for a_word_id in range(len(vector)):
            word_list.append((topic_model.corpus.word_for_id(a_word_id),
                              round(vector[a_word_id], 3), a_word_id))
        word_list = sorted(word_list, key=lambda x: x[1], reverse=True)
        documents = []
        for another_doc in topic_model.corpus.similar_documents(did, 5):
            documents.append((
                topic_model.corpus.title(another_doc[0]).title(),
                ', '.join(topic_model.corpus.author(another_doc[0])).title(),
                topic_model.corpus.date(another_doc[0]).strftime('%Y-%m-%d'),
                ', '.join(topic_model.corpus.affiliation(
                    another_doc[0])).title(),
                ', '.join(topic_model.corpus.dataset(another_doc[0])).title(),
                another_doc[0],
                round(another_doc[1], 3),
            ), )

        doc_topic_loading_barplot, _ = viz.plotly_doc_topic_loading(
            did,
            rename=rename,
            normalized=True,
            n_words=top_words_description,
            output_type='div',
            savedata=False,
        )

        return render_template(
            'document.html',
            doc_id=did,
            words=word_list[:21],
            topic_ids=topic_description,
            doc_ids=range(topic_model.corpus.size),
            documents=documents,
            title=topic_model.corpus.title(did).title(),
            authors=', '.join(topic_model.corpus.author(did)).title(),
            year=topic_model.corpus.date(did).strftime('%Y-%m-%d'),
            short_content=topic_model.corpus.title(did).title(),
            affiliation=', '.join(topic_model.corpus.affiliation(did)).title(),
            dataset=', '.join(topic_model.corpus.dataset(did)).title(),
            id=topic_model.corpus.id(did),
            full_text=topic_model.corpus.full_text(did),
            doc_topic_loading_barplot=doc_topic_loading_barplot,
        )

    @server.route('/word/<wid>.html')
    def word_details(wid: str):
        wid = int(wid)
        documents = []
        for document_id in topic_model.corpus.docs_for_word(wid, sort=True):
            documents.append((
                topic_model.corpus.title(document_id).title(),
                ', '.join(topic_model.corpus.author(document_id)).title(),
                topic_model.corpus.date(document_id).strftime('%Y-%m-%d'),
                ', '.join(topic_model.corpus.affiliation(document_id)).title(),
                ', '.join(topic_model.corpus.dataset(document_id)).title(),
                document_id,
            ), )

        word_topic_loading_barplot, _ = viz.plotly_word_topic_loading(
            wid,
            rename=rename,
            normalized=True,
            n_words=top_words_description,
            output_type='div',
            savedata=False,
        )

        return render_template(
            'word.html',
            word_id=wid,
            word=topic_model.corpus.word_for_id(wid),
            topic_ids=topic_description,
            doc_ids=range(topic_model.corpus.size),
            documents=documents,
            word_topic_loading_barplot=word_topic_loading_barplot,
        )

    @app.server.route('/favicon.ico')
    def favicon():
        return send_from_directory(static_folder / 'images',
                                   request.path[1:],
                                   mimetype='image/vnd.microsoft.icon')

    @server.route('/robots.txt')
    def robots_txt():
        return send_from_directory(static_folder, request.path[1:])

    # @server.url_defaults
    # def hashed_static_file(endpoint, values):
    #     """Flask: add static file's cache invalidator param (last modified time)
    #     to URLs generated by url_for(). Blueprints aware.
    #     """
    #     if 'static' == endpoint or endpoint.endswith('.static'):
    #         filename = values.get('filename')
    #         if filename:
    #             blueprint = request.blueprint
    #             if '.' in endpoint:  # blueprint
    #                 blueprint = endpoint.rsplit('.', 1)[0]

    #             static_folder = server.static_folder
    #             # use blueprint, but dont set `static_folder` option
    #             if blueprint and server.blueprints[blueprint].static_folder:
    #                 static_folder = server.blueprints[blueprint].static_folder

    #             fp = Path(static_folder, filename)
    #             if fp.exists():
    #                 values['_'] = int(fp.stat().st_mtime)

    return app
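Since main returns the Dash app (whose Flask server also hosts the browser routes), a caller might run it roughly as follows; the config loading, host, and port are assumptions, not taken from the original module.

# Illustrative only: run the returned Dash app; config file, section name, host and port are assumptions.
if __name__ == '__main__':
    import configparser
    config = configparser.ConfigParser()
    config.read('browser.ini')                    # hypothetical config file
    app = main(config['browser'])                 # hypothetical section name
    app.run_server(host='0.0.0.0', port=8050, debug=False)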