def __init__(self, df, nmf=None, num_topics=None, figsize=(14, 8)):
    '''
    INPUT:
        df: pandas DataFrame of articles.  Later plotting methods rely on
            the 'source', 'date_published', and 'lemmatized_text' columns.
        nmf: NMFClustering instance (default None).  If it has not been
            fit yet (no 'W_matrix' attribute) it is fit to df here.
        num_topics: int (default None).  Used only when nmf is not an
            NMFClustering instance: a new NMFClustering(num_topics) is
            created and fit to df.
        figsize: tuple (default (14, 8)).  Default figure size.

    Raises a ValueError when neither an NMFClustering object nor
    num_topics is supplied.
    '''
    self.df = df
    if isinstance(nmf, NMFClustering):
        # Check to make sure that the object has been fit to the data
        if not hasattr(nmf, 'W_matrix'):
            nmf.fit(df)
        self.nmf = nmf
    elif num_topics:
        self.nmf = NMFClustering(num_topics)
        self.nmf.fit(df)
    else:
        raise ValueError(
            "You must either supply a NMFClustering object or specify the number of topics!"
        )
    self.labels = get_topic_labels()
    # Total article count per outlet, in the same order as
    # self.nmf.outlets; used to normalize per-outlet time series.
    # NOTE(review): zip(...)[0] is Python 2 only -- a zip object is not
    # subscriptable under Python 3.
    self.outlet_sizes = [
        len(df.loc[df['source'] == outlet])
        for outlet in zip(*self.nmf.outlets)[0]
    ]
    self.candidate_info = get_candidate_info()
    # Human-readable names for the pandas resample frequency codes
    self.frequency = {'D': 'Daily', 'W': 'Weekly', 'M': 'Monthly'}
    self.figsize = figsize
def nmf_similarity(df, num_topics):
    '''Fit an NMFClustering model and score topic coherence.

    INPUT:
        df: pandas DataFrame of articles to cluster.
        num_topics: int, number of NMF topics to fit.

    OUTPUT:
        mean within-topic tf-idf similarity: float (nan when every topic
            was empty or had a single article)
        mean pairwise similarity between topic components: float
        NMF reconstruction error: float
        number of empty/singleton topics skipped: int
    '''
    print('Processing {} Topics...'.format(num_topics))
    nmf = NMFClustering(num_topics, tfidf_max_features=5000,
                        tfidf_max_df=0.75, random_state=42)
    nmf.fit(df)
    print('Clustering Done...')
    pbar = ProgressBar()
    tfidf_similarity = []
    num_zero = 0
    for topic in pbar(xrange(num_topics)):
        # When looking at high numbers of topics, it is possible for no
        # points to be assigned to a topic, in which case
        # pairwise_distances() will throw an error.  The topic is also
        # skipped if only one article is assigned, as pairwise_distances
        # would return nan.
        if nmf.labels[:, topic].sum() > 1:
            cosine_dist = pairwise_distances(
                nmf.tfidf_matrix[nmf.labels[:, topic]],
                metric='cosine', n_jobs=-1)
            # Lower triangle (k=-1) visits each unordered pair exactly once
            idx = np.tril_indices(cosine_dist.shape[0], k=-1)
            tfidf_similarity.append(1 - cosine_dist[idx].mean())
        else:
            num_zero += 1
    topic_similarity = pairwise_distances(nmf.nmf.components_,
                                          metric='cosine', n_jobs=1)
    idx = np.tril_indices(topic_similarity.shape[0], k=-1)
    if num_zero:
        print('Number of Empty Topics: {}'.format(num_zero))
    print('\n')
    # BUG FIX: np.mean([]) emits a RuntimeWarning and returns nan when
    # every topic was skipped; make that outcome explicit instead of
    # relying on the warning path.
    if tfidf_similarity:
        mean_tfidf_similarity = np.mean(tfidf_similarity)
    else:
        mean_tfidf_similarity = float('nan')
    return (mean_tfidf_similarity, 1 - topic_similarity[idx].mean(),
            nmf.nmf.reconstruction_err_, num_zero)
def __init__(self, df, nmf=None, num_topics=None, figsize=(14, 8)):
    '''Store the article DataFrame and a fitted NMFClustering model.

    Accepts either an NMFClustering instance via ``nmf`` (fit to ``df``
    here if it lacks a 'W_matrix' attribute) or a topic count via
    ``num_topics`` (a fresh model is then built and fit).  Raises
    ValueError when neither is supplied.
    '''
    self.df = df
    if isinstance(nmf, NMFClustering):
        # Fit the supplied model first if it has not seen the data yet
        if not hasattr(nmf, 'W_matrix'):
            nmf.fit(df)
        self.nmf = nmf
    elif num_topics:
        model = NMFClustering(num_topics)
        model.fit(df)
        self.nmf = model
    else:
        raise ValueError("You must either supply a NMFClustering object or specify the number of topics!")
    self.labels = get_topic_labels()
    self.candidate_info = get_candidate_info()
    # Per-outlet article totals, ordered like self.nmf.outlets
    outlet_names = zip(*self.nmf.outlets)[0]
    self.outlet_sizes = [len(df.loc[df['source'] == name])
                         for name in outlet_names]
    # Readable names for the resample frequency codes
    self.frequency = {'D': 'Daily', 'W': 'Weekly', 'M': 'Monthly'}
    self.figsize = figsize
class ElectionPlotting(object):
    '''Plotting utilities for the election article corpus.

    Wraps the article DataFrame together with a fitted NMFClustering
    model and produces time-series plots, word clouds, and per-source
    breakdowns for individual NMF topics.
    '''

    def __init__(self, df, nmf=None, num_topics=None, figsize=(14,8)):
        '''
        INPUT:
            df: pandas DataFrame of articles.  The methods below rely on
                the 'source', 'date_published', and 'lemmatized_text'
                columns.
            nmf: NMFClustering instance (default None).  If it has not
                been fit yet (no 'W_matrix' attribute) it is fit to df.
            num_topics: int (default None).  Used only when nmf is not an
                NMFClustering instance: a new model with this many topics
                is created and fit to df.
            figsize: tuple (default (14, 8)).  Default figure size used
                by _create_fig().

        Raises a ValueError when neither nmf nor num_topics is given.
        '''
        self.df = df
        if isinstance(nmf, NMFClustering):
            # Check to make sure that the object has been fit to the data
            if not hasattr(nmf, 'W_matrix'):
                nmf.fit(df)
            self.nmf = nmf
        elif num_topics:
            self.nmf = NMFClustering(num_topics)
            self.nmf.fit(df)
        else:
            raise ValueError("You must either supply a NMFClustering object or specify the number of topics!")
        self.labels = get_topic_labels()
        # Article count per outlet, ordered like self.nmf.outlets; used
        # to normalize per-outlet time series.  NOTE(review):
        # zip(...)[0] is Python 2 only -- a zip object is not
        # subscriptable under Python 3.
        self.outlet_sizes = [len(df.loc[df['source'] == outlet]) for outlet in zip(*self.nmf.outlets)[0]]
        self.candidate_info = get_candidate_info()
        # Human-readable names for the pandas resample frequency codes
        self.frequency = {'D': 'Daily', 'W': 'Weekly', 'M': 'Monthly'}
        self.figsize = figsize

    def article_count_by_time(self, topic_num=None, searchterm=None, source=False, freq='W', normalize=False, marker='o', year=False, fig=None, title=None, legend_label=None):
        '''
        Creates a plot of articles over time by count or frequency

        INPUT:
            topic_num: int (default None)
                If given, restrict the plot to articles labeled with this
                topic.
            searchterm: str (default None)
                If given, restrict to articles whose lemmatized text
                contains this substring.
            source: bool (default False)
                If True, draw one line per news outlet.
            freq: 'D', 'W', or 'M' (default 'W')
                Resampling frequency.
            normalize: bool (default False)
                With source=True, divide each outlet's series by that
                outlet's total article count.
            marker: str (default 'o')
            year: bool (default False)
                Indicates whether to set xlims to encompass the entirety
                of the year.  Passing None skips the xlim adjustment
                entirely (both branches below are bypassed).
            fig: Matplotlib figure object or tuple (default None)
                If figure object, plot will be added to the existing
                figure.  If tuple, figure will be created using specified
                size.  If None, figure will be created using self.figsize.
            title: str (default None)
            legend_label: bool (default None)
                If set, label the single line with the topic label.
        '''
        if isinstance(fig, tuple):
            fig = self._create_fig(fig)
        elif not fig:
            fig = self._create_fig()
        # If a specific topic was given set the label and subset the dataframe
        # NOTE(review): 'if topic_num' treats topic 0 the same as None,
        # and 'label' stays undefined when topic_num is None but
        # legend_label is truthy.
        if topic_num:
            label = self.labels.get(topic_num, 'Unknown')
            df = self.df.loc[self.nmf.labels[:, topic_num]]
        else:
            df = self.df
        # Subset the dataframe if a searchterm is provided
        if searchterm:
            df = df.loc[df['lemmatized_text'].str.contains(searchterm)]
        # If source is set, split up into a line for each news source
        if source:
            timeseries = [pd.Series([1], index=df.loc[df['source'] == outlet, 'date_published']).resample(freq).sum().fillna(0) for outlet in zip(*self.nmf.outlets)[0]]
            if normalize:
                timeseries = [ts / outlet_size for ts, outlet_size in zip(timeseries, self.outlet_sizes)]
                plt.ylabel('Article Frequency (freq = {})'.format(freq), fontsize=12)
            else:
                plt.ylabel('Article Count (Freq = {})'.format(freq), fontsize=12)
                # plt.subplots_adjust(left=0.08, bottom=0.12, right=0.95, top=0.92)
            for idx, ts in enumerate(timeseries):
                # Outlets with no matching articles produce empty series
                if len(ts):
                    ts.plot(marker=marker, label=self.nmf.outlets[idx][1], c=self.nmf.outlets[idx][2])
            plt.xlabel('Date Published ({})'.format(self.frequency[freq]), fontsize=12)
            plt.legend(loc='best')
        else:
            ts = pd.Series([1], index=df['date_published']).resample(freq).sum().fillna(0)
            if legend_label:
                ts.plot(marker=marker, label=label)
            else:
                ts.plot(marker=marker)
            plt.xlabel('Date Published ({})'.format(self.frequency[freq]), fontsize=12)
            plt.ylabel('Article Count (freq={})'.format(freq), fontsize=12)
        if topic_num:
            plt.title('Topic Number {}: {}'.format(topic_num, label))
        elif searchterm:
            plt.title("Articles Containing '{}'".format(searchterm), fontsize=14)
        elif title:
            plt.title(title)
        plt.subplots_adjust(left=0.06, bottom=0.1, right=0.97, top=0.92)
        # Adjust the date range for the x-axis to allow for two weeks on
        # either side.  If year is set to True, show from the beginning
        # to the end of the minimum and maximum years respectively
        if year == True:
            xmin = df['date_published'].min()
            xmin = pd.to_datetime('{}-01-01'.format(xmin.year)) - pd.Timedelta(weeks=2)
            xmax = df['date_published'].max()
            xmax = pd.to_datetime('{}-01-01'.format(xmax.year+1)) + pd.Timedelta(weeks=2)
            plt.xlim((xmin, xmax))
        elif year == False:
            xmin = df['date_published'].min() - pd.Timedelta(weeks=2)
            xmax = df['date_published'].max() + pd.Timedelta(weeks=2)
            plt.xlim((xmin, xmax))

    def _create_fig(self, figsize=None, watermark=True):
        '''
        Helper function to create a figure and optionally add watermark

        INPUT:
            figsize: tuple (default None); falls back to self.figsize.
            watermark: bool (default True); stamp author/repo text onto
                the figure.
        OUTPUT:
            Matplotlib figure object
        '''
        if figsize == None:
            figsize = self.figsize
        fig = plt.figure(figsize=figsize)
        if watermark:
            fig.text(0.05, 0.03, 'Author: Erich Wellinger', fontsize=10, alpha=0.7)
            fig.text(0.33, 0.75, 'github.com/ewellinger/election_analysis', fontsize=20, color='gray', alpha=0.5)
        return fig

    def candidate_plots(self, candidate_names, title, byline=None, freq='W', fig=None):
        '''
        candidate_plots plots multiple topics on one plot

        INPUT:
            candidate_names: list of str
                Should be a list of strings identifying which candidates
                to plot (keys of self.candidate_info).
            title: str
                Title for the plot
            byline: str (default None)
                Byline to go beneath the title
            freq: 'D', 'W', or 'M' (default 'W')
            fig: Matplotlib figure object or tuple (default None)
        '''
        if isinstance(fig, tuple):
            fig = self._create_fig(fig)
        elif not fig:
            fig = self._create_fig()
        candidate_topic_idxs = [self.candidate_info[name][0] for name in candidate_names]
        # NOTE(review): c_list is computed but never used
        c_list = zip(*self.nmf.outlets)[2]
        for idx, candidate in enumerate(candidate_topic_idxs):
            # year=None skips the xlim adjustment inside article_count_by_time
            self.article_count_by_time(topic_num=candidate, freq=freq, legend_label=True, fig=fig, year=None)
        # Make sure the xlims have a buffer on either side of earliest and latest values
        xmin, xmax = plt.xlim()
        # Add in a buffer to either and reset the xlims
        # NOTE(review): the *7 assumes the axis units correspond to weeks
        # -- confirm this holds for freq values other than 'W'
        xmin = pd.to_datetime(xmin*7, unit='D') - pd.Timedelta(weeks=2)
        xmax = pd.to_datetime(xmax*7, unit='D') + pd.Timedelta(weeks=2)
        plt.xlim((xmin, xmax))
        plt.legend(loc='best')
        plt.subplots_adjust(left=0.05, bottom=0.1, right=0.97)
        plt.suptitle(title, fontsize=20)
        if byline:
            plt.title(byline, fontsize=10)
        else:
            # Clear the axes title so only the suptitle shows
            plt.title('')

    def topic_word_cloud(self, topic_num, max_words=200, figsize=None, width=2400, height=1300, ax=None, mask_fname=None, inherit_color=False):
        '''
        Create word cloud for a given topic

        INPUT:
            topic_num: int
            max_words: int (default 200)
                Max number of words to encorporate into the word cloud
            figsize: tuple (int, int)
                Size of the figure if an axis isn't passed
            width: int (default 2400)
            height: int (default 1300)
            ax: None or matplotlib axis object
            mask_fname: None or str
                None if no mask is desired, otherwise a string providing
                the path to the image being used as the mask
            inherit_color: bool, default False
                Indicates whether the wordcloud should inherit the colors
                from the image mask
        '''
        if figsize == None:
            figsize = self.figsize
        if mask_fname:
            mask = np.array(Image.open(mask_fname))
            wc = WordCloud(background_color='white', max_words=max_words, mask=mask, width=width, height=height)
        else:
            wc = WordCloud(background_color='white', max_words=max_words, width=width, height=height)
        word_freq = self.nmf.topic_word_frequency(topic_num)
        # Fit the WordCloud object to the specific topic's word frequencies
        wc.fit_words(word_freq)
        # Create the matplotlib figure and axis if they weren't passed in
        # NOTE(review): uses self.figsize here rather than the local
        # figsize resolved above
        if not ax:
            fig = plt.figure(figsize=self.figsize)
            ax = fig.add_subplot(111)
        if mask_fname and inherit_color:
            image_colors = ImageColorGenerator(imread(mask_fname))
            plt.imshow(wc.recolor(color_func=image_colors))
            plt.axis('off')
        else:
            ax.imshow(wc)
            ax.axis('off')

    def normalized_source_barchart(self, topic_num, ax=None):
        '''
        Make a bar chart reflecting the normalized reporting by source

        INPUT:
            topic_num: int
            ax: Matplotlib axis object (default None)
                If no axis object is provided, one will be created with
                size (2.5, 5)
        '''
        num_articles = self.nmf.labels[:, topic_num].sum()
        df = self.df.loc[self.nmf.labels[:, topic_num]]
        # Share of the topic's articles contributed by each outlet...
        percent_by_source = [float(len(df.loc[df['source'] == outlet])) / num_articles for outlet in zip(*self.nmf.outlets)[0]]
        # ...rescaled by each outlet's overall volume, then renormalized
        # so the shares sum to 1
        normalized = [percent / outlet_size for percent, outlet_size in zip(percent_by_source, self.outlet_sizes)]
        normalized = [percent / np.sum(normalized) for percent in normalized]
        if not ax:
            fig, ax = plt.subplots(1, figsize=(2.5, 5))
        # Draw a single stacked bar, one segment per outlet
        for idx, percent in enumerate(normalized):
            ax.bar(0, percent, width=1, label=self.nmf.outlets[idx][1], color=self.nmf.outlets[idx][2], bottom=np.sum(normalized[:idx]))
            # Annotate segments large enough to hold text, shrinking the
            # font for smaller segments; segments under 2.5% get no label
            if percent >= 0.1:
                ax.text(0.5, np.sum(normalized[:idx]) + 0.5*percent, '{0}: {1:.1f}%'.format(self.nmf.outlets[idx][1], 100*percent), horizontalalignment='center', verticalalignment='center')
            elif percent >= 0.05:
                ax.text(0.5, np.sum(normalized[:idx]) + 0.5*percent, '{0}: {1:.1f}%'.format(self.nmf.outlets[idx][1], 100*percent), horizontalalignment='center', verticalalignment='center', fontsize=10)
            elif percent >= 0.025:
                ax.text(0.5, np.sum(normalized[:idx]) + 0.5*percent, '{0}: {1:.1f}%'.format(self.nmf.outlets[idx][1], 100*percent), horizontalalignment='center', verticalalignment='center', fontsize=8)
        plt.axis('off')
        plt.title('% Reported By Source (Normalized)', fontsize=10)

    def topic_time_and_cloud(self, topic_num, source=False, year=False, title=None):
        '''
        Creates viz of topic including counts over time, word cloud, and
        breakdown by source

        INPUT:
            topic_num: int
                Which topic to generate the plot for
            source: bool (default False)
                Arg passed to the article_count_by_time function
            year: bool (default False)
                Arg passed to the article_count_by_time function
            title: str (default None)
                str to use as title, otherwise "Topic Number
                {topic_num}: {label}" will be used
        OUTPUT:
            ax1: Matplotlib axis object
                Used to modify the article count by time axis (e.g. add
                vertical lines to signify event of import)
        '''
        fig = self._create_fig(figsize=(14, 8.5), watermark=False)
        # Top panel: article counts over time
        ax1 = fig.add_axes([0.05, 0.5, 0.93, 0.41])
        self.article_count_by_time(topic_num=topic_num, source=source, year=year, fig=fig, legend_label=True)
        ax1.xaxis.labelpad = -4
        plt.title('Number of Articles in Topic: {}'.format(self.nmf.labels[:, topic_num].sum()), x=0.4825)
        if title == None:
            plt.suptitle('Topic Number {}: {}'.format(topic_num, self.labels.get(topic_num, "Unknown")), fontsize=20)
        else:
            plt.suptitle(title, fontsize=20)
        # Watermark placed manually since _create_fig was called with
        # watermark=False
        fig.text(0.05, 0.44, 'Author: Erich Wellinger', fontsize=10, alpha=0.7)
        fig.text(0.33, 0.8, 'github.com/ewellinger/election_analysis', fontsize=20, color='gray', alpha=0.5)
        # Bottom-left panel: word cloud for the topic
        ax2 = fig.add_axes([0.025, 0, 0.79, 0.43])
        self.topic_word_cloud(topic_num, ax=ax2, width=1900, height=625)
        # Bottom-right panel: normalized per-source breakdown
        ax3 = fig.add_axes([0.825, 0.01, 0.15555, 0.4])
        self.normalized_source_barchart(topic_num, ax=ax3)
        return ax1

    def single_candidate_plot(self, candidate_last_name):
        '''
        Returns plot associated with a particular candidate

        Calls ElectionPlotting.topic_time_and_cloud() for the relevant
        topic_num and adds lines to mark announcement and dropout dates
        '''
        topic_num, announce_date, drop_date = self.candidate_info[candidate_last_name.lower()]
        ax = self.topic_time_and_cloud(topic_num)
        if announce_date:
            ax.axvline(x=pd.to_datetime(announce_date), label='Announcement Date', c='#55a868', lw=3, alpha=0.8)
        if drop_date:
            # NOTE(review): 'Withdrawl' is misspelled ('Withdrawal') in
            # this user-facing legend label
            ax.axvline(x=pd.to_datetime(drop_date), label='Withdrawl Date', c='#c44e52', lw=3, alpha=0.8)
        ax.legend(loc='best')
import pandas as pd
import numpy as np
# BUG FIX: plt was used throughout but matplotlib.pyplot was never
# imported in this script, so it failed with a NameError at call time.
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from NMF_Clustering import NMFClustering


def make_skree_plot(pca, n_components, x=None, save=False):
    '''Plot the explained-variance ratio of the leading PCA components.

    INPUT:
        pca: fitted sklearn PCA object (provides
            explained_variance_ratio_).
        n_components: int, number of leading components to plot.
        x: int (default None).  If given, draw a dashed red vertical
            line at this component index (a chosen cutoff).
        save: bool (default False).  If True, save to
            ./plots/scree_plot.png and close the figure.
    '''
    plt.figure(figsize=(12, 8))
    plt.plot(range(n_components), pca.explained_variance_ratio_[:n_components])
    plt.xlabel('Number of Components')
    plt.ylabel('% Explained Variance')
    # BUG FIX: compare against None so a cutoff of 0 still draws the line
    if x is not None:
        plt.axvline(x=x, color='r', linestyle='--')
    plt.title('Scree Plot')
    plt.subplots_adjust(left=0.08, bottom=0.08, right=0.96, top=0.93)
    if save:
        plt.savefig('./plots/scree_plot.png', dpi=300)
        plt.close()


if __name__ == '__main__':
    df = pd.read_pickle('election_data.pkl')
    # Only the tf-idf representation is needed for PCA; full NMF fitting
    # is skipped
    nmf = NMFClustering(300)
    nmf.fit_tfidf(df)
    pca = PCA()
    pca.fit(nmf.tfidf_matrix)
    make_skree_plot(pca, 300, x=90, save=True)
class ElectionPlotting(object):
    '''Plotting utilities for the election article corpus.

    Wraps the article DataFrame together with a fitted NMFClustering
    model and produces time-series plots, word clouds, and per-source
    breakdowns for individual NMF topics.
    '''

    def __init__(self, df, nmf=None, num_topics=None, figsize=(14, 8)):
        '''
        INPUT:
            df: pandas DataFrame of articles.  The methods below rely on
                the 'source', 'date_published', and 'lemmatized_text'
                columns.
            nmf: NMFClustering instance (default None).  If it has not
                been fit yet (no 'W_matrix' attribute) it is fit to df.
            num_topics: int (default None).  Used only when nmf is not an
                NMFClustering instance: a new model with this many topics
                is created and fit to df.
            figsize: tuple (default (14, 8)).  Default figure size used
                by _create_fig().

        Raises a ValueError when neither nmf nor num_topics is given.
        '''
        self.df = df
        if isinstance(nmf, NMFClustering):
            # Check to make sure that the object has been fit to the data
            if not hasattr(nmf, 'W_matrix'):
                nmf.fit(df)
            self.nmf = nmf
        elif num_topics:
            self.nmf = NMFClustering(num_topics)
            self.nmf.fit(df)
        else:
            raise ValueError(
                "You must either supply a NMFClustering object or specify the number of topics!"
            )
        self.labels = get_topic_labels()
        # Article count per outlet, ordered like self.nmf.outlets; used
        # to normalize per-outlet time series.  NOTE(review):
        # zip(...)[0] is Python 2 only -- a zip object is not
        # subscriptable under Python 3.
        self.outlet_sizes = [
            len(df.loc[df['source'] == outlet])
            for outlet in zip(*self.nmf.outlets)[0]
        ]
        self.candidate_info = get_candidate_info()
        # Human-readable names for the pandas resample frequency codes
        self.frequency = {'D': 'Daily', 'W': 'Weekly', 'M': 'Monthly'}
        self.figsize = figsize

    def article_count_by_time(self, topic_num=None, searchterm=None, source=False, freq='W', normalize=False, marker='o', year=False, fig=None, title=None, legend_label=None):
        '''
        Creates a plot of articles over time by count or frequency

        INPUT:
            topic_num: int (default None)
                If given, restrict the plot to articles labeled with this
                topic.
            searchterm: str (default None)
                If given, restrict to articles whose lemmatized text
                contains this substring.
            source: bool (default False)
                If True, draw one line per news outlet.
            freq: 'D', 'W', or 'M' (default 'W')
                Resampling frequency.
            normalize: bool (default False)
                With source=True, divide each outlet's series by that
                outlet's total article count.
            marker: str (default 'o')
            year: bool (default False)
                Indicates whether to set xlims to encompass the entirety
                of the year.  Passing None skips the xlim adjustment
                entirely (both branches below are bypassed).
            fig: Matplotlib figure object or tuple (default None)
                If figure object, plot will be added to the existing
                figure.  If tuple, figure will be created using specified
                size.  If None, figure will be created using self.figsize.
            title: str (default None)
            legend_label: bool (default None)
                If set, label the single line with the topic label.
        '''
        if isinstance(fig, tuple):
            fig = self._create_fig(fig)
        elif not fig:
            fig = self._create_fig()
        # If a specific topic was given set the label and subset the dataframe
        # NOTE(review): 'if topic_num' treats topic 0 the same as None,
        # and 'label' stays undefined when topic_num is None but
        # legend_label is truthy.
        if topic_num:
            label = self.labels.get(topic_num, 'Unknown')
            df = self.df.loc[self.nmf.labels[:, topic_num]]
        else:
            df = self.df
        # Subset the dataframe if a searchterm is provided
        if searchterm:
            df = df.loc[df['lemmatized_text'].str.contains(searchterm)]
        # If source is set, split up into a line for each news source
        if source:
            timeseries = [
                pd.Series([1], index=df.loc[df['source'] == outlet, 'date_published']).resample(
                    freq).sum().fillna(0)
                for outlet in zip(*self.nmf.outlets)[0]
            ]
            if normalize:
                timeseries = [
                    ts / outlet_size
                    for ts, outlet_size in zip(timeseries, self.outlet_sizes)
                ]
                plt.ylabel('Article Frequency (freq = {})'.format(freq),
                           fontsize=12)
            else:
                plt.ylabel('Article Count (Freq = {})'.format(freq),
                           fontsize=12)
                # plt.subplots_adjust(left=0.08, bottom=0.12, right=0.95, top=0.92)
            for idx, ts in enumerate(timeseries):
                # Outlets with no matching articles produce empty series
                if len(ts):
                    ts.plot(marker=marker,
                            label=self.nmf.outlets[idx][1],
                            c=self.nmf.outlets[idx][2])
            plt.xlabel('Date Published ({})'.format(self.frequency[freq]),
                       fontsize=12)
            plt.legend(loc='best')
        else:
            ts = pd.Series(
                [1], index=df['date_published']).resample(freq).sum().fillna(0)
            if legend_label:
                ts.plot(marker=marker, label=label)
            else:
                ts.plot(marker=marker)
            plt.xlabel('Date Published ({})'.format(self.frequency[freq]),
                       fontsize=12)
            plt.ylabel('Article Count (freq={})'.format(freq), fontsize=12)
        if topic_num:
            plt.title('Topic Number {}: {}'.format(topic_num, label))
        elif searchterm:
            plt.title("Articles Containing '{}'".format(searchterm),
                      fontsize=14)
        elif title:
            plt.title(title)
        plt.subplots_adjust(left=0.06, bottom=0.1, right=0.97, top=0.92)
        # Adjust the date range for the x-axis to allow for two weeks on
        # either side.  If year is set to True, show from the beginning
        # to the end of the minimum and maximum years respectively
        if year == True:
            xmin = df['date_published'].min()
            xmin = pd.to_datetime('{}-01-01'.format(
                xmin.year)) - pd.Timedelta(weeks=2)
            xmax = df['date_published'].max()
            xmax = pd.to_datetime(
                '{}-01-01'.format(xmax.year + 1)) + pd.Timedelta(weeks=2)
            plt.xlim((xmin, xmax))
        elif year == False:
            xmin = df['date_published'].min() - pd.Timedelta(weeks=2)
            xmax = df['date_published'].max() + pd.Timedelta(weeks=2)
            plt.xlim((xmin, xmax))

    def _create_fig(self, figsize=None, watermark=True):
        '''
        Helper function to create a figure and optionally add watermark

        INPUT:
            figsize: tuple (default None); falls back to self.figsize.
            watermark: bool (default True); stamp author/repo text onto
                the figure.
        OUTPUT:
            Matplotlib figure object
        '''
        if figsize == None:
            figsize = self.figsize
        fig = plt.figure(figsize=figsize)
        if watermark:
            fig.text(0.05, 0.03, 'Author: Erich Wellinger', fontsize=10,
                     alpha=0.7)
            fig.text(0.33, 0.75, 'github.com/ewellinger/election_analysis',
                     fontsize=20, color='gray', alpha=0.5)
        return fig

    def candidate_plots(self, candidate_names, title, byline=None, freq='W', fig=None):
        '''
        candidate_plots plots multiple topics on one plot

        INPUT:
            candidate_names: list of str
                Should be a list of strings identifying which candidates
                to plot (keys of self.candidate_info).
            title: str
                Title for the plot
            byline: str (default None)
                Byline to go beneath the title
            freq: 'D', 'W', or 'M' (default 'W')
            fig: Matplotlib figure object or tuple (default None)
        '''
        if isinstance(fig, tuple):
            fig = self._create_fig(fig)
        elif not fig:
            fig = self._create_fig()
        candidate_topic_idxs = [
            self.candidate_info[name][0] for name in candidate_names
        ]
        # NOTE(review): c_list is computed but never used
        c_list = zip(*self.nmf.outlets)[2]
        for idx, candidate in enumerate(candidate_topic_idxs):
            # year=None skips the xlim adjustment inside article_count_by_time
            self.article_count_by_time(topic_num=candidate,
                                       freq=freq,
                                       legend_label=True,
                                       fig=fig,
                                       year=None)
        # Make sure the xlims have a buffer on either side of earliest and latest values
        xmin, xmax = plt.xlim()
        # Add in a buffer to either and reset the xlims
        # NOTE(review): the * 7 assumes the axis units correspond to
        # weeks -- confirm this holds for freq values other than 'W'
        xmin = pd.to_datetime(xmin * 7, unit='D') - pd.Timedelta(weeks=2)
        xmax = pd.to_datetime(xmax * 7, unit='D') + pd.Timedelta(weeks=2)
        plt.xlim((xmin, xmax))
        plt.legend(loc='best')
        plt.subplots_adjust(left=0.05, bottom=0.1, right=0.97)
        plt.suptitle(title, fontsize=20)
        if byline:
            plt.title(byline, fontsize=10)
        else:
            # Clear the axes title so only the suptitle shows
            plt.title('')

    def topic_word_cloud(self, topic_num, max_words=200, figsize=None, width=2400, height=1300, ax=None, mask_fname=None, inherit_color=False):
        '''
        Create word cloud for a given topic

        INPUT:
            topic_num: int
            max_words: int (default 200)
                Max number of words to encorporate into the word cloud
            figsize: tuple (int, int)
                Size of the figure if an axis isn't passed
            width: int (default 2400)
            height: int (default 1300)
            ax: None or matplotlib axis object
            mask_fname: None or str
                None if no mask is desired, otherwise a string providing
                the path to the image being used as the mask
            inherit_color: bool, default False
                Indicates whether the wordcloud should inherit the colors
                from the image mask
        '''
        if figsize == None:
            figsize = self.figsize
        if mask_fname:
            mask = np.array(Image.open(mask_fname))
            wc = WordCloud(background_color='white',
                           max_words=max_words,
                           mask=mask,
                           width=width,
                           height=height)
        else:
            wc = WordCloud(background_color='white',
                           max_words=max_words,
                           width=width,
                           height=height)
        word_freq = self.nmf.topic_word_frequency(topic_num)
        # Fit the WordCloud object to the specific topic's word frequencies
        wc.fit_words(word_freq)
        # Create the matplotlib figure and axis if they weren't passed in
        # NOTE(review): uses self.figsize here rather than the local
        # figsize resolved above
        if not ax:
            fig = plt.figure(figsize=self.figsize)
            ax = fig.add_subplot(111)
        if mask_fname and inherit_color:
            image_colors = ImageColorGenerator(imread(mask_fname))
            plt.imshow(wc.recolor(color_func=image_colors))
            plt.axis('off')
        else:
            ax.imshow(wc)
            ax.axis('off')

    def normalized_source_barchart(self, topic_num, ax=None):
        '''
        Make a bar chart reflecting the normalized reporting by source

        INPUT:
            topic_num: int
            ax: Matplotlib axis object (default None)
                If no axis object is provided, one will be created with
                size (2.5, 5)
        '''
        num_articles = self.nmf.labels[:, topic_num].sum()
        df = self.df.loc[self.nmf.labels[:, topic_num]]
        # Share of the topic's articles contributed by each outlet...
        percent_by_source = [
            float(len(df.loc[df['source'] == outlet])) / num_articles
            for outlet in zip(*self.nmf.outlets)[0]
        ]
        # ...rescaled by each outlet's overall volume, then renormalized
        # so the shares sum to 1
        normalized = [
            percent / outlet_size for percent, outlet_size in zip(
                percent_by_source, self.outlet_sizes)
        ]
        normalized = [percent / np.sum(normalized) for percent in normalized]
        if not ax:
            fig, ax = plt.subplots(1, figsize=(2.5, 5))
        # Draw a single stacked bar, one segment per outlet
        for idx, percent in enumerate(normalized):
            ax.bar(0,
                   percent,
                   width=1,
                   label=self.nmf.outlets[idx][1],
                   color=self.nmf.outlets[idx][2],
                   bottom=np.sum(normalized[:idx]))
            # Annotate segments large enough to hold text, shrinking the
            # font for smaller segments; segments under 2.5% get no label
            if percent >= 0.1:
                ax.text(0.5,
                        np.sum(normalized[:idx]) + 0.5 * percent,
                        '{0}: {1:.1f}%'.format(self.nmf.outlets[idx][1],
                                               100 * percent),
                        horizontalalignment='center',
                        verticalalignment='center')
            elif percent >= 0.05:
                ax.text(0.5,
                        np.sum(normalized[:idx]) + 0.5 * percent,
                        '{0}: {1:.1f}%'.format(self.nmf.outlets[idx][1],
                                               100 * percent),
                        horizontalalignment='center',
                        verticalalignment='center',
                        fontsize=10)
            elif percent >= 0.025:
                ax.text(0.5,
                        np.sum(normalized[:idx]) + 0.5 * percent,
                        '{0}: {1:.1f}%'.format(self.nmf.outlets[idx][1],
                                               100 * percent),
                        horizontalalignment='center',
                        verticalalignment='center',
                        fontsize=8)
        plt.axis('off')
        plt.title('% Reported By Source (Normalized)', fontsize=10)

    def topic_time_and_cloud(self, topic_num, source=False, year=False, title=None):
        '''
        Creates viz of topic including counts over time, word cloud, and
        breakdown by source

        INPUT:
            topic_num: int
                Which topic to generate the plot for
            source: bool (default False)
                Arg passed to the article_count_by_time function
            year: bool (default False)
                Arg passed to the article_count_by_time function
            title: str (default None)
                str to use as title, otherwise "Topic Number
                {topic_num}: {label}" will be used
        OUTPUT:
            ax1: Matplotlib axis object
                Used to modify the article count by time axis (e.g. add
                vertical lines to signify event of import)
        '''
        fig = self._create_fig(figsize=(14, 8.5), watermark=False)
        # Top panel: article counts over time
        ax1 = fig.add_axes([0.05, 0.5, 0.93, 0.41])
        self.article_count_by_time(topic_num=topic_num,
                                   source=source,
                                   year=year,
                                   fig=fig,
                                   legend_label=True)
        ax1.xaxis.labelpad = -4
        plt.title('Number of Articles in Topic: {}'.format(
            self.nmf.labels[:, topic_num].sum()), x=0.4825)
        if title == None:
            plt.suptitle('Topic Number {}: {}'.format(
                topic_num, self.labels.get(topic_num, "Unknown")),
                         fontsize=20)
        else:
            plt.suptitle(title, fontsize=20)
        # Watermark placed manually since _create_fig was called with
        # watermark=False
        fig.text(0.05, 0.44, 'Author: Erich Wellinger', fontsize=10,
                 alpha=0.7)
        fig.text(0.33, 0.8, 'github.com/ewellinger/election_analysis',
                 fontsize=20, color='gray', alpha=0.5)
        # Bottom-left panel: word cloud for the topic
        ax2 = fig.add_axes([0.025, 0, 0.79, 0.43])
        self.topic_word_cloud(topic_num, ax=ax2, width=1900, height=625)
        # Bottom-right panel: normalized per-source breakdown
        ax3 = fig.add_axes([0.825, 0.01, 0.15555, 0.4])
        self.normalized_source_barchart(topic_num, ax=ax3)
        return ax1

    def single_candidate_plot(self, candidate_last_name):
        '''
        Returns plot associated with a particular candidate

        Calls ElectionPlotting.topic_time_and_cloud() for the relevant
        topic_num and adds lines to mark announcement and dropout dates
        '''
        topic_num, announce_date, drop_date = self.candidate_info[
            candidate_last_name.lower()]
        ax = self.topic_time_and_cloud(topic_num)
        if announce_date:
            ax.axvline(x=pd.to_datetime(announce_date),
                       label='Announcement Date',
                       c='#55a868',
                       lw=3,
                       alpha=0.8)
        if drop_date:
            # NOTE(review): 'Withdrawl' is misspelled ('Withdrawal') in
            # this user-facing legend label
            ax.axvline(x=pd.to_datetime(drop_date),
                       label='Withdrawl Date',
                       c='#c44e52',
                       lw=3,
                       alpha=0.8)
        ax.legend(loc='best')
# Custom color cycle and line width applied to all subsequent
# matplotlib plots
color_cycle = cycler(
    color=['#4c72b0', '#55a868', '#c44e52', '#8172b2', '#ccb974', '#64b5cd'])
matplotlib.rcParams['axes.prop_cycle'] = color_cycle
matplotlib.rcParams['lines.linewidth'] = 1.75


def make_skree_plot(pca, n_components, x=None, save=False):
    '''Plot the explained-variance ratio of the leading PCA components.

    INPUT:
        pca: fitted sklearn PCA object (provides
            explained_variance_ratio_).
        n_components: int, number of leading components to plot.
        x: int (default None).  If truthy, draw a dashed red vertical
            line at this component index.
        save: bool (default False).  If True, save to
            ./plots/scree_plot.png and close the figure.
    '''
    fig = plt.figure(figsize=(12, 8))
    plt.plot(range(n_components), pca.explained_variance_ratio_[:n_components])
    plt.xlabel('Number of Components')
    plt.ylabel('% Explained Variance')
    if x:
        plt.axvline(x=x, color='r', linestyle='--')
    plt.title('Scree Plot')
    plt.subplots_adjust(left=0.08, bottom=0.08, right=0.96, top=0.93)
    if save:
        plt.savefig('./plots/scree_plot.png', dpi=300)
        plt.close()


if __name__ == '__main__':
    df = pd.read_pickle('election_data.pkl')
    # NOTE(review): -1 topics looks like a placeholder/sentinel value --
    # confirm NMFClustering accepts it (only fit_tfidf is used here, so
    # the topic count may be irrelevant to this script)
    nmf = NMFClustering(-1)
    nmf.fit_tfidf(df)
    pca = PCA()
    pca.fit(nmf.tfidf_matrix)
    make_skree_plot(pca, 350, x=250, save=True)
import pandas as pd
import numpy as np
# BUG FIX: plt was used throughout but matplotlib.pyplot was never
# imported in this script, so it failed with a NameError at call time.
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from NMF_Clustering import NMFClustering


def make_skree_plot(pca, n_components, x=None, save=False):
    '''Plot the explained-variance ratio of the leading PCA components.

    INPUT:
        pca: fitted sklearn PCA object (provides
            explained_variance_ratio_).
        n_components: int, number of leading components to plot.
        x: int (default None).  If given, draw a dashed red vertical
            line at this component index (a chosen cutoff).
        save: bool (default False).  If True, save to
            ./plots/scree_plot.png and close the figure.
    '''
    plt.figure(figsize=(12, 8))
    plt.plot(range(n_components), pca.explained_variance_ratio_[:n_components])
    plt.xlabel('Number of Components')
    plt.ylabel('% Explained Variance')
    # BUG FIX: compare against None so a cutoff of 0 still draws the line
    if x is not None:
        plt.axvline(x=x, color='r', linestyle='--')
    plt.title('Scree Plot')
    plt.subplots_adjust(left=0.08, bottom=0.08, right=0.96, top=0.93)
    if save:
        plt.savefig('./plots/scree_plot.png', dpi=300)
        plt.close()


if __name__ == '__main__':
    df = pd.read_pickle('election_data.pkl')
    # Only the tf-idf representation is needed for PCA; full NMF fitting
    # is skipped
    nmf = NMFClustering(300)
    nmf.fit_tfidf(df)
    pca = PCA()
    pca.fit(nmf.tfidf_matrix)
    make_skree_plot(pca, 300, x=90, save=True)