def get_avg_coherence(df, n_topics):
    """Fit NMF with `n_topics` topics on `df` and return the mean topic coherence.

    Parameters
    ----------
    df : pandas.DataFrame
        Article corpus passed straight through to `nmf_articles`.
    n_topics : int
        Number of topics to factorize the corpus into.

    Returns
    -------
    float
        Mean of `topic_coherence` over all topics.
    """
    # Parenthesized single-argument print is output-identical under Python 2
    # and also valid Python 3, unlike the original bare print statement.
    print('{} Topics Processing...'.format(n_topics))
    # NOTE(review): return contract assumed from the unpacking below —
    # confirm against nmf_articles' definition.
    nmf, X, W, W_percent, labels, topic_words, feature_names, reverse_lookup = nmf_articles(
        df, n_topics=n_topics, n_features=10000, random_state=1, max_df=0.8, min_df=5)
    print('Factorizing Done...')
    # ProgressBar wraps the iterable to show progress while scoring each topic.
    pbar = ProgressBar()
    coherence = [topic_coherence(X, reverse_lookup, words) for words in pbar(topic_words)]
    print('\n')
    return np.mean(coherence)
def get_avg_coherence(df, n_topics): print '{} Topics Processing...'.format(n_topics) nmf, X, W, W_percent, labels, topic_words, feature_names, reverse_lookup = nmf_articles( df, n_topics=n_topics, n_features=10000, random_state=1, max_df=0.8, min_df=5) print 'Factorizing Done...' pbar = ProgressBar() coherence = [] for words in pbar(topic_words): coherence.append(topic_coherence(X, reverse_lookup, words)) print '\n' return np.mean(coherence)
# NOTE(review): this chunk begins mid-function -- 'figsize', 'wc', and 'ax'
# come from an enclosing plotting helper whose 'def' line is outside this view.
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(111)
# Render the (presumably wordcloud) image with no axis chrome.
ax.imshow(wc)
ax.axis('off')


if __name__ == '__main__':
    # Load the pre-scraped election article dataset.
    df = pd.read_pickle('election_data.pkl')
    # Plot % of articles mentioning candidate across all news sources
    # plot_candidate_percentages(df, ['Clinton', 'Trump', 'Bush'])
    # Factorize the corpus into 90 topics; the unpacked names suggest the
    # model, document-term matrix, topic weights, labels, top words per topic,
    # and vocabulary lookups -- confirm against nmf_articles' definition.
    nmf, X, W, W_percent, labels, topic_words, feature_names, reverse_lookup = nmf_articles(
        df, n_topics=90, n_features=10000, random_state=1, max_df=0.8, min_df=5)
    # (source key, display label, plot color) per news outlet.
    outlets = [('nyt', 'NYT', '#4c72b0'), ('foxnews', 'FOX', '#c44e52'),
               ('npr', 'NPR', '#55a868'), ('guardian', 'GUA', '#8172b2'),
               ('wsj', 'WSJ', '#ccb974')]
    # predominant_source = print_topic_summary(df, labels, outlets, topic_words)
    # Create a dictionary with the topic labels for creating the plots
    topic_labels = get_topic_labels()
    # path = './topic_plots/'
    # for idx in xrange(90):
# Create the matplotlib figure and axis if they weren't passed in if not ax: fig = plt.figure(figsize=figsize) ax = fig.add_subplot(111) ax.imshow(wc) ax.axis('off') if __name__=='__main__': df = pd.read_pickle('election_data.pkl') # Plot % of articles mentioning candidate accross all news sources # plot_candidate_percentages(df, ['Clinton', 'Trump', 'Bush']) nmf, X, W, W_percent, labels, topic_words, feature_names, reverse_lookup = nmf_articles(df, n_topics=90, n_features=10000, random_state=1, max_df=0.8, min_df=5) outlets = [('nyt', 'NYT', '#4c72b0'), ('foxnews', 'FOX', '#c44e52'), ('npr', 'NPR', '#55a868'), ('guardian', 'GUA', '#8172b2'), ('wsj', 'WSJ', '#ccb974')] # predominant_source = print_topic_summary(df, labels, outlets, topic_words) # Create a dictionary with the topic labels for creating the plots topic_labels = get_topic_labels() # path = './topic_plots/' # for idx in xrange(90): # # If the topic is junk, skip making the plot # if topic_labels[idx] == 'junk': # print '\n' # continue # print 'Topic {}: {}'.format(str(idx), topic_labels[idx])