def get_avg_coherence(df, n_topics):
    """Return the mean topic coherence for an NMF fit with `n_topics` topics.

    Runs `nmf_articles` on `df` with fixed settings (n_features=10000,
    random_state=1, max_df=0.8, min_df=5), scores every entry of the
    returned `topic_words` with `topic_coherence`, and averages the scores.
    Progress messages are printed to stdout while it runs.

    Args:
        df: article data, passed unchanged to `nmf_articles`.
        n_topics: number of topics requested from `nmf_articles`.

    Returns:
        `np.mean` of the per-topic coherence scores.
    """
    # Single-argument, parenthesized print produces the same output on
    # Python 2 (print statement) and Python 3 (print function).
    print('{} Topics Processing...'.format(n_topics))

    # nmf_articles returns an 8-tuple; only X, topic_words and reverse_lookup
    # are needed here, so the remaining slots are discarded.
    # NOTE(review): X is presumably the document-term matrix -- confirm
    # against nmf_articles.
    _, X, _, _, _, topic_words, _, reverse_lookup = nmf_articles(
        df,
        n_topics=n_topics,
        n_features=10000,
        random_state=1,
        max_df=0.8,
        min_df=5)
    print('Factorizing Done...')

    # Wrapping the iterable in a ProgressBar instance reports progress while
    # each topic's word list is scored.
    pbar = ProgressBar()
    coherence = [topic_coherence(X, reverse_lookup, words)
                 for words in pbar(topic_words)]

    # Emit blank lines so later output starts clear of the progress bar.
    print('\n')
    return np.mean(coherence)
def get_avg_coherence(df, n_topics):
    """Compute the average coherence over all topics of an NMF factorization.

    The articles in `df` are factorized by `nmf_articles` using
    n_features=10000, random_state=1, max_df=0.8 and min_df=5. Each topic's
    word list is then scored by `topic_coherence` and the scores are
    averaged. Status messages go to stdout.

    Args:
        df: article data, forwarded as-is to `nmf_articles`.
        n_topics: how many topics `nmf_articles` should extract.

    Returns:
        The value of `np.mean` applied to the list of per-topic scores.
    """
    # print(...) with one argument is valid, and prints the same text, under
    # both Python 2 and Python 3; the bare print statement is Python 2 only.
    print('{} Topics Processing...'.format(n_topics))

    factorization = nmf_articles(
        df,
        n_topics=n_topics,
        n_features=10000,
        random_state=1,
        max_df=0.8,
        min_df=5)
    # Unpack the full 8-tuple so a change in its length still fails loudly,
    # but keep only the three values this function uses.
    (_nmf, X, _W, _W_percent, _labels,
     topic_words, _feature_names, reverse_lookup) = factorization
    print('Factorizing Done...')

    # Iterating through a ProgressBar-wrapped iterable shows progress while
    # the per-topic scores are collected.
    progress = ProgressBar()
    scores = [topic_coherence(X, reverse_lookup, words)
              for words in progress(topic_words)]

    # Blank lines separate the progress bar from whatever is printed next.
    print('\n')
    return np.mean(scores)
Esempio n. 3
0
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
    ax.imshow(wc)
    ax.axis('off')


if __name__ == '__main__':
    # Script entry point: load the pickled article data, factorize it into
    # 90 topics, and fetch the topic labels used for plotting.
    # NOTE(review): unpickling can execute arbitrary code -- only load
    # 'election_data.pkl' from a trusted source.
    df = pd.read_pickle('election_data.pkl')

    # Plot % of articles mentioning candidate across all news sources
    # plot_candidate_percentages(df, ['Clinton', 'Trump', 'Bush'])

    # Factorize the articles; nmf_articles returns an 8-tuple that is
    # unpacked into module-level names for the steps below.
    nmf, X, W, W_percent, labels, topic_words, feature_names, reverse_lookup = nmf_articles(
        df,
        n_topics=90,
        n_features=10000,
        random_state=1,
        max_df=0.8,
        min_df=5)

    # One 3-tuple per news outlet; presumably (source key, short display
    # label, hex plot color) -- confirm against the helpers that consume it.
    # In the visible code it is only referenced by the commented-out
    # print_topic_summary call below.
    outlets = [('nyt', 'NYT', '#4c72b0'), ('foxnews', 'FOX', '#c44e52'),
               ('npr', 'NPR', '#55a868'), ('guardian', 'GUA', '#8172b2'),
               ('wsj', 'WSJ', '#ccb974')]

    # predominant_source = print_topic_summary(df, labels, outlets, topic_words)

    # Create a dictionary with the topic labels for creating the plots
    topic_labels = get_topic_labels()

    # path = './topic_plots/'
    # for idx in xrange(90):
Esempio n. 4
0
    # Create the matplotlib figure and axis if they weren't passed in
    if not ax:
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
    ax.imshow(wc)
    ax.axis('off')


if __name__=='__main__':
    df = pd.read_pickle('election_data.pkl')

    # Plot % of articles mentioning candidate accross all news sources
    # plot_candidate_percentages(df, ['Clinton', 'Trump', 'Bush'])

    nmf, X, W, W_percent, labels, topic_words, feature_names, reverse_lookup = nmf_articles(df, n_topics=90, n_features=10000, random_state=1, max_df=0.8, min_df=5)

    outlets = [('nyt', 'NYT', '#4c72b0'), ('foxnews', 'FOX', '#c44e52'), ('npr', 'NPR', '#55a868'), ('guardian', 'GUA', '#8172b2'), ('wsj', 'WSJ', '#ccb974')]

    # predominant_source = print_topic_summary(df, labels, outlets, topic_words)

    # Create a dictionary with the topic labels for creating the plots
    topic_labels = get_topic_labels()

    # path = './topic_plots/'
    # for idx in xrange(90):
    #     # If the topic is junk, skip making the plot
    #     if topic_labels[idx] == 'junk':
    #         print '\n'
    #         continue
    #     print 'Topic {}: {}'.format(str(idx), topic_labels[idx])