import csv
import os

import numpy as np

import corpus_processing


def write_exemplary_docs(doc_topics, model_dir, k, corpus_data, num_docs=50):

    # For each topic, find which documents that topic has the highest proportion in.
    for topic_index in range(k):

        # Make sure directory for topic exists
        topic_dir = corpus_processing.make_dir(
            os.path.join(model_dir, 'topic_' + f'{topic_index:02d}'))

        # Sort document indices by this topic's probability, from highest to lowest.
        # doc_topics[:, topic_index] is this topic's column of the document-topic
        # matrix; argsort on its negation yields the row/document indices in
        # descending order of probability, and only the top num_docs are kept.
        sorted_doc_indices = np.argsort(-doc_topics[:, topic_index])[:num_docs]
        sorted_topic_probs = [
            doc_topics[doc_indx, topic_index]
            for doc_indx in sorted_doc_indices
        ]

        # Path to the CSV that lists this topic's exemplary documents.
        topic_doclist_path = os.path.join(
            topic_dir, 'topic_' + f'{topic_index:02d}' + '_document_list.csv')

        # Write exemplary documents for each topic to file.
        with open(topic_doclist_path, 'w', newline='',
                  encoding='utf-8') as ofile:
            fwriter = csv.writer(ofile)

            fwriter.writerow([
                'submission_rank', 'subreddit', 'topic_probability',
                'submission_date', 'submission_id', 'submission_url'
            ])

            # For each exemplary document, write its key details to a CSV row.
            for i, doc_index in enumerate(sorted_doc_indices):

                # 1-based rank of this document among the topic's exemplars.
                subm_rank = i + 1

                # Submission info for this exemplary document.
                doc_data = corpus_data[doc_index]

                # Write information for exemplary document to CSV row
                fwriter.writerow([
                    subm_rank, doc_data['subreddit'], sorted_topic_probs[i],
                    doc_data['submission_date'], doc_data['submission_id'],
                    doc_data['submission_url']
                ])
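

# How the dense document-topic matrix passed to write_exemplary_docs is produced is
# not shown in these snippets. A minimal sketch, assuming a trained gensim LdaModel
# and a bag-of-words corpus (this helper and its name are illustrative, not part of
# the original project):
def doc_topic_matrix(lda_model, corpus, k):
    from gensim import matutils

    # corpus2dense gives a (k x num_docs) array; transpose it so each row holds one
    # document's topic proportions, matching the doc_topics[:, topic_index] indexing
    # used above.
    return matutils.corpus2dense(lda_model[corpus], num_terms=k).T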


def write_top_words(model, model_dir, k, num_words=500):

    for topic_index in range(k):

        # Make directory to store all topic-specific data
        topic_dir = corpus_processing.make_dir(
            os.path.join(model_dir, 'topic_' + f'{topic_index:02d}'))

        # Get the highest probability terms from this topic
        topic_words = model.show_topic(topic_index, num_words)

        # Path to CSV file that will hold the topic words and probabilities:
        wordlist_fname = os.path.join(
            topic_dir, 'topic_' + f'{topic_index:02d}' + '_word_list.csv')

        # Write top terms to csv file.
        with open(wordlist_fname, 'w', newline='', encoding='utf-8') as ofile:
            f_writer = csv.writer(ofile)

            f_writer.writerow(['word', 'probability'])

            for (word, prob) in topic_words:
                f_writer.writerow([word, prob])
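

# corpus_processing.make_dir is used throughout these snippets but not shown; it is
# assumed to simply create the directory if it does not already exist and return the
# path, roughly:
#
#     def make_dir(path):
#         os.makedirs(path, exist_ok=True)
#         return path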
Example #3
        # fd here is assumed to be a word-frequency distribution (e.g. an nltk FreqDist
        # or collections.Counter) and t_count the total number of tokens. Write each
        # word, its count, and its share of all tokens to the CSV.
        fwriter.writerow(['word', 'count', 'prop_of_total_tokens'])

        for word, count in fd.most_common(len(fd)):
            prop_of_total_tokens = float(count) / float(t_count)
            fwriter.writerow([word, count, prop_of_total_tokens])


if __name__ == '__main__':
    # Subreddits to be analyzed.
    subreddit_list = ['buddhism', 'christianity', 'religion', 'math']

    # Directories
    cwd = os.getcwd()
    project_dir = os.path.dirname(cwd)
    data_dir = os.path.join(project_dir, '0_data')
    exp_dir = corpus_processing.make_dir(os.path.join(cwd, 'exp1'))
    stats_dir = corpus_processing.make_dir(
        os.path.join(exp_dir, 'corpus_stats'))

    # Load the list of bot users to ignore (optional).
    with open('ignore_users.txt', 'r', encoding='utf-8') as f:
        ignore_users = list(set(f.read().split(',\n')))

    # Iterate through each subreddit to get some general statistics about its submissions.
    for subreddit in subreddit_list:
        # Directory for subreddit-specific results
        sub_stats_dir = corpus_processing.make_dir(
            os.path.join(stats_dir, subreddit))
Example #4

    # Specify the two models to be compared.
    model_1_corpus_name = 'buddhism'
    model_1_name = '001_k-30'

    model_2_corpus_name = 'buddhism'
    model_2_name = '002_k-30'

    # Directories
    cwd = os.getcwd()  # Directory of this program
    project_dir = os.path.dirname(cwd)  # Main project directory
    data_dir = os.path.join(project_dir, '0_data')  # Data directory
    exp_dir = os.path.join(cwd, experiment_name)
    corpora_dir = os.path.join(exp_dir, '2_corpora')
    lda_dir = os.path.join(exp_dir, '3_lda')
    comparison_dir = corpus_processing.make_dir(
        os.path.join(exp_dir, '9_topic_comparisons'))

    # Create a directory for storing any comparisons made between the two models
    results_dir = corpus_processing.make_dir(
        os.path.join(
            comparison_dir, model_1_corpus_name + '_' + model_1_name + '-' +
            model_2_corpus_name + '_' + model_2_name))

    # load both models
    lda_model_1 = topic_modeling.load_model(
        os.path.join(lda_dir, model_1_corpus_name, model_1_name, 'model_files',
                     'lda_model'))

    lda_model_2 = topic_modeling.load_model(
        os.path.join(lda_dir, model_2_corpus_name, model_2_name, 'model_files',
                     'lda_model'))
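
    # topic_modeling.load_model is not shown in these snippets; since the paths point
    # at serialized LDA models, it is assumed to be a thin wrapper around gensim's
    # loader, roughly:
    #
    #     from gensim.models import LdaModel
    #     def load_model(path):
    #         return LdaModel.load(path)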
Example #5
if __name__ == '__main__':
    experiment_name = 'exp1'

    comparisons = [('buddhism', 'christianity'), ('buddhism', 'math'),
                   ('buddhism', 'religion'), ('christianity', 'math'),
                   ('christianity', 'religion')]

    # Directories
    cwd = os.getcwd()  # Directory of this program
    project_dir = os.path.dirname(cwd)  # Main project directory
    data_dir = os.path.join(project_dir, '0_data')  # Data directory
    exp_dir = os.path.join(cwd, experiment_name)
    corpora_dir = os.path.join(exp_dir, '2_corpora')

    comparison_dir = corpus_processing.make_dir(
        os.path.join(exp_dir, '4_comparisons'))

    dictionary_path = os.path.join(exp_dir, 'dictionary.dict')
    dictionary = corpus_processing.load_dictionary(dictionary_path)
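
    # corpus_processing.load_dictionary is not shown here; given the .dict file it is
    # assumed to wrap gensim's dictionary loader, roughly:
    #
    #     from gensim.corpora import Dictionary
    #     def load_dictionary(path):
    #         return Dictionary.load(path)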

    for (subreddit_1, subreddit_2) in comparisons:
        sub_comp_dir = corpus_processing.make_dir(
            os.path.join(comparison_dir, subreddit_1 + '-' + subreddit_2))

        word_freqs_path_1 = os.path.join(corpora_dir, subreddit_1,
                                         'word_freqs.csv')
        word_freqs_path_2 = os.path.join(corpora_dir, subreddit_2,
                                         'word_freqs.csv')

        if os.path.exists(word_freqs_path_1):
            word_counts_1, word_props_1 = corpus_processing.load_word_freqs_and_props(
Example #6
    assert len(model_ids) == len(k_list) == len(random_states)

    # Fixed modeling parameters
    passes = 20
    eval_every = None
    iterations = 400
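
    # These fixed parameters (together with num_topics=k and random_state, which vary
    # per model) are assumed to be handed to gensim's LdaModel later in this script,
    # roughly:
    #
    #     from gensim.models import LdaModel
    #     lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k,
    #                          passes=passes, iterations=iterations,
    #                          eval_every=eval_every, random_state=random_state)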

    subreddit_list = ['buddhism', 'christianity', 'math', 'religion']

    # Directories
    cwd = os.getcwd()  # Directory of this program
    project_dir = os.path.dirname(cwd)  # Main project directory
    exp_dir = os.path.join(cwd, experiment_name)
    corpora_dir = os.path.join(exp_dir, '2_corpora')

    lda_dir = corpus_processing.make_dir(os.path.join(exp_dir, '3_lda'))

    # The same dictionary is shared by every subreddit in this experiment, so it only needs to be read in once.
    dictionary_path = os.path.join(exp_dir, 'dictionary.dict')
    dictionary = corpus_processing.load_dictionary(dictionary_path)

    # Train model for each subreddit and write all relevant information to files.
    for subreddit in subreddit_list:
        print(subreddit)

        # Load corpus.
        corpus_path = os.path.join(corpora_dir, subreddit, 'corpus.mm')
        corpus = corpus_processing.load_corpus(corpus_path)
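        # load_corpus is assumed to wrap gensim's Matrix Market reader for corpus.mm,
        # roughly: gensim.corpora.MmCorpus(corpus_path).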

        # Load corpus data.
        corpus_data_path = os.path.join(corpora_dir, subreddit,
Example #7
import os
import itertools
import corpus_processing

if __name__ == '__main__':
    experiment_name = 'exp1'

    subreddit_list = ['buddhism', 'christianity', 'math', 'religion']

    # Directories
    cwd = os.getcwd()  # Directory of this program
    project_dir = os.path.dirname(cwd)  # Main project directory
    data_dir = os.path.join(project_dir, '0_data')  # Data directory

    # Create directory for storing experiment-specific results
    exp_dir = corpus_processing.make_dir(os.path.join(cwd, experiment_name))

    # Create directory for storing each bag-of-words object
    corpora_dir = corpus_processing.make_dir(os.path.join(
        exp_dir, '2_corpora'))

    # Read in pre-made set of stopwords
    stoplist = corpus_processing.load_stoplist('stoplist.txt')

    # Load the list of bot users to ignore (optional).
    with open('ignore_users.txt', 'r', encoding='utf-8') as f:
        ignore_users = list(set(f.read().lower().split(',\n')))

    # For each subreddit, create a subreddit-specific dictionary object with the following: