def _update_plot(self, axis, view):
    """Draw ``view`` with the seaborn function selected by ``self.plot_type``.

    Axes-level plot types (``regplot``, ``boxplot``, ``violinplot``,
    ``interact``, ``corrplot``, ``lmplot``) are drawn with ``ax=axis``.
    Grid plot types (``pairplot``, ``pairgrid``, ``facetgrid``) build their
    own figure: the figure stored in ``self.handles['fig']`` is closed and
    replaced with the current pyplot figure.  Any other plot type is
    delegated to the parent class.

    Parameters
    ----------
    axis : the axes passed to the seaborn call as ``ax``
    view : object providing ``data`` and the column names ``x``, ``y``
        (and ``x2`` for the ``interact`` plot type)
    """
    # Work on a shallow copy: the branches below pop entries, and popping
    # from ``self.style`` itself would (a) raise "dictionary changed size
    # during iteration" on Python 3 when collecting the map options and
    # (b) permanently drop those options, so a later redraw would lose them.
    style = dict(self.style)

    if self.plot_type == 'regplot':
        sns.regplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
    elif self.plot_type == 'boxplot':
        # These two options are not forwarded to seaborn.
        style.pop('return_type', None)
        style.pop('figsize', None)
        # NOTE(review): positional (values, groups) call form - presumably
        # the legacy seaborn signature; confirm against the pinned version.
        sns.boxplot(view.data[view.y], view.data[view.x], ax=axis, **style)
    elif self.plot_type == 'violinplot':
        sns.violinplot(view.data[view.y], view.data[view.x], ax=axis,
                       **style)
    elif self.plot_type == 'interact':
        sns.interactplot(view.x, view.x2, view.y, data=view.data, ax=axis,
                         **style)
    elif self.plot_type == 'corrplot':
        sns.corrplot(view.data, ax=axis, **style)
    elif self.plot_type == 'lmplot':
        sns.lmplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
    elif self.plot_type in ['pairplot', 'pairgrid', 'facetgrid']:
        # Every style key containing 'map' names a grid method (e.g.
        # ``map``, ``map_diag``); its value is (function name, *args).
        # Snapshot the keys first so popping does not disturb iteration.
        map_opts = [(k, style.pop(k)) for k in list(style) if 'map' in k]
        if self.plot_type == 'pairplot':
            g = sns.pairplot(view.data, **style)
        elif self.plot_type == 'pairgrid':
            g = sns.PairGrid(view.data, **style)
        else:
            g = sns.FacetGrid(view.data, **style)
        for opt, args in map_opts:
            # Prefer a seaborn function, fall back to pyplot.
            plot_fn = (getattr(sns, args[0]) if hasattr(sns, args[0])
                       else getattr(plt, args[0]))
            getattr(g, opt)(plot_fn, *args[1:])
        # The grid created a new figure; drop the old one and track the new.
        plt.close(self.handles['fig'])
        self.handles['fig'] = plt.gcf()
    else:
        super(SNSFramePlot, self)._update_plot(axis, view)
def _update_plot(self, axis, view):
    """Render ``view`` using the seaborn routine named by ``self.plot_type``.

    The style options come from ``self._process_style`` applied to the
    current entry of the style cycle.  ``factorplot`` and the grid plot
    types (``pairplot``, ``pairgrid``, ``facetgrid``) create their own
    figure; for the grid types the figure held in ``self.handles['fig']``
    is closed and replaced by the current pyplot figure.  Unknown plot
    types are handed to the parent class.
    """
    opts = self._process_style(self.style[self.cyclic_index])
    kind = self.plot_type

    if kind == 'factorplot':
        # Add the hue variable only when the view defines a second factor.
        factor_opts = dict(opts)
        if view.x2:
            factor_opts['hue'] = view.x2
        sns.factorplot(x=view.x, y=view.y, data=view.data, **factor_opts)
        return

    if kind == 'regplot':
        sns.regplot(x=view.x, y=view.y, data=view.data, ax=axis, **opts)
        return

    if kind == 'boxplot':
        # These two options are not forwarded to seaborn.
        for dropped in ('return_type', 'figsize'):
            opts.pop(dropped, None)
        sns.boxplot(view.data[view.y], view.data[view.x], ax=axis, **opts)
        return

    if kind == 'violinplot':
        if view.x:
            sns.violinplot(view.data[view.y], view.data[view.x], ax=axis,
                           **opts)
        else:
            sns.violinplot(view.data, ax=axis, **opts)
        return

    if kind == 'interact':
        sns.interactplot(view.x, view.x2, view.y, data=view.data, ax=axis,
                         **opts)
        return

    if kind == 'corrplot':
        sns.corrplot(view.data, ax=axis, **opts)
        return

    if kind == 'lmplot':
        sns.lmplot(x=view.x, y=view.y, data=view.data, ax=axis, **opts)
        return

    if kind not in ('pairplot', 'pairgrid', 'facetgrid'):
        super(SNSFramePlot, self)._update_plot(axis, view)
        return

    # Grid plots: options whose key contains 'map' are not constructor
    # arguments but (function name, *args) specs for grid methods.
    map_calls = []
    for key in list(opts.keys()):
        if 'map' in key:
            map_calls.append((key, opts.pop(key)))

    factory_name = {'pairplot': 'pairplot',
                    'pairgrid': 'PairGrid',
                    'facetgrid': 'FacetGrid'}[kind]
    grid = getattr(sns, factory_name)(view.data, **opts)

    for method_name, spec in map_calls:
        fn_name = spec[0]
        # Prefer a seaborn function, fall back to pyplot.
        provider = sns if hasattr(sns, fn_name) else plt
        plot_fn = getattr(provider, fn_name)
        getattr(grid, method_name)(plot_fn, *spec[1:])

    # The grid lives on a new figure; close the old one and track the new.
    plt.close(self.handles['fig'])
    self.handles['fig'] = plt.gcf()
def _update_plot(self, axis, view):
    """Plot ``view`` with the seaborn function matching ``self.plot_type``.

    Style options are obtained from ``self._process_style`` for the current
    position in the style cycle.  For the grid plot types (``pairplot``,
    ``pairgrid``, ``facetgrid``) the previous figure in
    ``self.handles['fig']`` is closed only when ``self._close_figures`` is
    set, and the handle is then pointed at the current pyplot figure.
    Plot types not handled here are passed to the parent class.
    """
    plot_kwargs = self._process_style(self.style[self.cyclic_index])
    plot_type = self.plot_type
    grid_types = ('pairplot', 'pairgrid', 'facetgrid')

    if plot_type == 'factorplot':
        # Colour by the second factor only when the view provides one.
        hue_kwargs = {'hue': view.x2} if view.x2 else {}
        merged = dict(plot_kwargs, **hue_kwargs)
        sns.factorplot(x=view.x, y=view.y, data=view.data, **merged)
    elif plot_type == 'regplot':
        sns.regplot(x=view.x, y=view.y, data=view.data, ax=axis,
                    **plot_kwargs)
    elif plot_type == 'boxplot':
        # These two options are not forwarded to seaborn.
        plot_kwargs.pop('return_type', None)
        plot_kwargs.pop('figsize', None)
        sns.boxplot(view.data[view.y], view.data[view.x], ax=axis,
                    **plot_kwargs)
    elif plot_type == 'violinplot':
        if not view.x:
            sns.violinplot(view.data, ax=axis, **plot_kwargs)
        else:
            sns.violinplot(view.data[view.y], view.data[view.x], ax=axis,
                           **plot_kwargs)
    elif plot_type == 'interact':
        sns.interactplot(view.x, view.x2, view.y, data=view.data, ax=axis,
                         **plot_kwargs)
    elif plot_type == 'corrplot':
        sns.corrplot(view.data, ax=axis, **plot_kwargs)
    elif plot_type == 'lmplot':
        sns.lmplot(x=view.x, y=view.y, data=view.data, ax=axis,
                   **plot_kwargs)
    elif plot_type in grid_types:
        # Split off the '...map...' entries: each one names a grid method
        # and carries (function name, *args) for it.
        mapping_specs = []
        for name in list(plot_kwargs.keys()):
            if 'map' not in name:
                continue
            mapping_specs.append((name, plot_kwargs.pop(name)))

        if plot_type == 'pairplot':
            grid = sns.pairplot(view.data, **plot_kwargs)
        elif plot_type == 'pairgrid':
            grid = sns.PairGrid(view.data, **plot_kwargs)
        else:
            grid = sns.FacetGrid(view.data, **plot_kwargs)

        for grid_method, call_args in mapping_specs:
            target = call_args[0]
            # Prefer a seaborn function, fall back to pyplot.
            if hasattr(sns, target):
                plot_fn = getattr(sns, target)
            else:
                plot_fn = getattr(plt, target)
            getattr(grid, grid_method)(plot_fn, *call_args[1:])

        if self._close_figures:
            plt.close(self.handles['fig'])
        self.handles['fig'] = plt.gcf()
    else:
        super(SNSFramePlot, self)._update_plot(axis, view)
def plot_synapse_2d(X, zh, output='synapse_cluster_2d.pdf'):
    """Save a 2-D scatter plot of ``X`` coloured by cluster assignment.

    ``X`` is reduced to two components with ``PCA(n_components=2)`` and the
    two projected coordinates are plotted against each other without a
    regression line.

    Parameters
    ----------
    X : data passed to ``PCA.fit_transform``
        (presumably an array of shape (n_samples, n_features) - confirm).
    zh : iterable of cluster labels, one per row of ``X``; each label is
        shifted by one and converted to ``int`` for display.
    output : path handed to the grid's ``savefig``.
    """
    x_name = r'$x_1$'
    y_name = r'$x_2$'
    cluster_name = r'$\mathcal{C}_j$'

    pca = PCA(n_components=2)
    X_new = pca.fit_transform(X)

    df = pd.DataFrame(data={
        x_name: X_new[:, 0],
        y_name: X_new[:, 1],
        cluster_name: [int(z + 1) for z in zh],
    })

    # x and y are passed by keyword: newer seaborn releases make them
    # keyword-only, and the keyword form is valid in older releases too.
    g = sns.lmplot(x=x_name, y=y_name, data=df, hue=cluster_name,
                   fit_reg=False, scatter=True, scatter_kws={"s": 4})
    g.set(xlabel=x_name)
    g.set(ylabel=y_name)
    g.savefig(output)
def plot_synapse_2d(X, zh, output='synapse_cluster_2d.pdf'):
    """Project ``X`` onto two principal components and save a scatter plot.

    The points are coloured by cluster label and drawn with
    ``seaborn.lmplot`` with the regression fit disabled.

    Parameters
    ----------
    X : input handed to ``PCA(n_components=2).fit_transform``
        (presumably shaped (n_samples, n_features) - confirm).
    zh : iterable with one cluster label per row of ``X``; labels are
        displayed as ``int(label + 1)``.
    output : destination passed to ``savefig`` on the resulting grid.
    """
    pca = PCA(n_components=2)
    X_new = pca.fit_transform(X)

    x_col, y_col, hue_col = r'$x_1$', r'$x_2$', r'$\mathcal{C}_j$'
    d = {
        x_col: X_new[:, 0],
        y_col: X_new[:, 1],
        hue_col: [int(z + 1) for z in zh],
    }
    df = pd.DataFrame(data=d)

    # Pass x/y by keyword; positional x/y is rejected by newer seaborn
    # releases, while the keyword form works across versions.
    g = sns.lmplot(x=x_col, y=y_col, data=df, hue=hue_col,
                   fit_reg=False, scatter=True, scatter_kws={"s": 4})
    g.set(xlabel=x_col)
    g.set(ylabel=y_col)
    g.savefig(output)
def lmplot(self, x=None, y=None, hue=None, data=None, *args, **kwargs):
    """
    Plot data and regression model fits

    One ``seaborn.lmplot`` figure is drawn and shown for every (x, y)
    column pair.  Rows where either column is NaN or infinite are dropped
    from the data passed to seaborn for that pair.

    Parameters
    ----------
    x : a name, or a list of names, of variables in data used as the
        regressor (horizontal axis)
    y : a name, or a list of names, of variables in data that are fitted
        against every x above
    hue : the name of a variable in data that provides labels for each
        category
    data : pandas dataframe
    **kwargs : other arguments in seaborn.lmplot
        palette : palette name, list, or dict, optional
        col_wrap : int, optional
        size : scalar, optional
        aspect : scalar, optional
        markers : matplotlib marker code or list of marker codes, optional
        share{x,y} : bool, optional
        legend : bool, optional (defaults to True here)
        legend_out : bool, optional (defaults to False here)
        x_estimator : callable that maps vector -> scalar, optional
        x_bins : int or vector, optional
        x_ci : 'ci', 'sd', int in [0, 100] or None, optional
        scatter : bool, optional
        fit_reg : bool, optional
        ci : int in [0, 100] or None, optional
        n_boot : int, optional
        units : variable name in data, optional
        order : int, optional
        logistic : bool, optional
        lowess : bool, optional
        robust : bool, optional
        logx : bool, optional
        {x,y}_partial : strings in data or matrices
        truncate : bool, optional
        {x,y}_jitter : floats, optional
        {scatter,line}_kws : dictionaries

    Returns
    -------
    None. Each figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If data is not a pandas dataframe, if x or y is None, or if hue or
        any x / y name is not a column of data.

    References
    ----------
    Seaborn lmplot further documentation
    https://seaborn.pydata.org/generated/seaborn.lmplot.html
    """
    # check data
    if not isinstance(data, pd.DataFrame):
        raise ValueError('data must be pandas dataframe')

    # check x and y; a single name is wrapped into a list
    if x is None:
        raise ValueError('x can NOT be None')
    if not isinstance(x, (list, tuple, np.ndarray, pd.Index)):
        x = [x]

    if y is None:
        raise ValueError('y can NOT be None')
    if not isinstance(y, (list, tuple, np.ndarray, pd.Index)):
        y = [y]

    if hue is not None and hue not in data.columns.values:
        raise ValueError('{} is NOT in data'.format(hue))

    # legend / legend_out are defaults, not fixed values: passing them
    # explicitly next to **kwargs would raise "got multiple values for
    # keyword argument" whenever the caller supplies either one.
    plot_kwargs = dict(kwargs)
    plot_kwargs.setdefault('legend', True)
    plot_kwargs.setdefault('legend_out', False)

    # no figure configuration needed
    plt.close()

    # iterate thru y, and for every y thru x
    for col_y in y:
        if col_y not in data.columns.values:
            raise ValueError('{} is NOT in data'.format(col_y))

        b = data[col_y]
        # builtin bool: the np.bool alias no longer exists in current NumPy
        b_not_nan = np.ones(b.shape[0], dtype=bool)
        if np.logical_not(np.isfinite(b)).any():
            logger.warning('RUNTIME WARNING: {} column has inf or nan '
                           ''.format(col_y))
            b = b.replace([-np.inf, np.inf], np.nan)
            # filter
            b_not_nan = np.logical_not(b.isnull())

        for col_x in x:
            # check if col in data
            if col_x not in data.columns.values:
                raise ValueError('{} is NOT in data'.format(col_x))

            a = data[col_x]
            a_not_nan = np.ones(a.shape[0], dtype=bool)
            if np.logical_not(np.isfinite(a)).any():
                logger.warning('RUNTIME WARNING: {} column has inf or '
                               'nan'.format(col_x))
                a = a.replace([-np.inf, np.inf], np.nan)
                # filter
                a_not_nan = np.logical_not(a.isnull())

            # joint filter: keep rows that are finite in both columns
            not_nan = b_not_nan & a_not_nan

            joint_grid = sns.lmplot(
                x=col_x,
                y=col_y,
                data=data.loc[not_nan, :],
                hue=hue,
                *args, **plot_kwargs)

            ax = joint_grid.fig.axes[0]
            ax.set_title(label='Reg Fit of {} on {} '
                               ''.format(col_y, col_x),
                         fontsize=self.title_fontsize)
            ax.set_xlabel(xlabel=col_x, fontsize=self.label_fontsize)
            ax.set_ylabel(ylabel=col_y, fontsize=self.label_fontsize)
            # 'major' is the valid spelling; 'maj' is not an accepted value
            ax.tick_params(axis='both',
                           which='major',
                           labelsize=self.tick_fontsize)
            ax.legend(loc='upper right')

            joint_grid.fig.subplots_adjust(wspace=0.5,
                                           hspace=0.3,
                                           left=0.125,
                                           right=0.9,
                                           top=0.9,
                                           bottom=0.1)
            joint_grid.fig.tight_layout()
            plt.show()
# With red crosses.
plt.figure(2)
plt.plot(CarData['hwy'], CarData['cty'], 'r+')
plt.xlabel('hwy')
plt.ylabel('cty')
plt.title('Scatter plot of hwy vs cty')
plt.grid(True)
plt.show()

# You can also change the size of the points depending on a variable.
# E.g. if you want to display more common brands (in terms of number of
# observations) with larger points, use the "s" (size) argument:
CarData['counts'] = CarData.groupby(['make'])['make'].transform('count')
plt.figure(3)
plt.scatter(CarData['hwy'], CarData['cty'], marker='o', c='r',
            s=CarData['counts'])
plt.xlabel('hwy')
plt.ylabel('cty')
plt.title('Scatter plot of hwy vs cty')
plt.grid(True)
plt.show()

# We can also use sns.regplot or sns.lmplot.
# Note the different keyword: regplot takes ``marker`` while lmplot takes
# ``markers``; lmplot has no ``marker`` parameter, so that spelling fails.
sns.regplot(x='cty', y='hwy', marker="+", ci=95, data=CarData)
sns.lmplot(x="cty", y="hwy", markers="o", ci=95, data=CarData)
plt.pie(peace_age, labels=peace_age.index, autopct='%1.1f%%')
plt.show()

sns.jointplot(x="Year", y="Age", kind='reg', data=data)
plt.show()

sns.boxplot(data=data, x='Category', y='Age')
plt.show()

# x and y are given by keyword, like the other seaborn calls above:
# newer seaborn releases reject positional x/y for lmplot.
sns.lmplot(x='Year', y='Age', data=data, lowess=True, aspect=2,
           line_kws={'color': 'black'})
plt.show()

# Question 2: What words are most frequently written in the prize motivation?
top_N = 10

# One alternation regex matching any English stopword as a whole word.
stopwords = nltk.corpus.stopwords.words('english')
re_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords))

# Lower-case every motivation, blank out '|' characters and stopwords,
# then join everything into one string and split it into a flat word list.
words = (data['Motivation']
         .str.lower()
         .replace([r'\|', re_stopwords], [' ', ' '], regex=True)
         .str.cat(sep=' ')
         .split()
         )
def plot(self, show_samples, show_loadings, sbrn_plt):
    """Fit a PCA on ``self.X`` and plot the samples and/or feature loadings.

    The first two principal components are plotted, so
    ``self.n_components`` must be at least 2.  Every branch shows the
    figure and then writes it to ``pca.pdf`` in the parent directory of
    the current working directory.

    Parameters
    ----------
    show_samples : bool
        matplotlib mode only.  When true, plot one point per sample,
        coloured by author and labelled with the last ``_``-separated part
        of its title.  When false, plot only the feature loadings.
    show_loadings : bool
        matplotlib mode with ``show_samples`` only.  When true, overlay the
        feature names at their loadings on a twin pair of axes.
    sbrn_plt : bool
        When true, ignore the two flags above and draw the samples with
        ``seaborn.lmplot`` (no regression fit), coloured by author.
    """
    # Normalizer and Delta perform badly
    # They flatten out all difference in a PCA plot
    pca = PCA(n_components=self.n_components)
    X_bar = pca.fit_transform(self.X)
    explained_variance = np.round(
        sum(pca.explained_variance_ratio_) * 100, decimals=2)
    loadings = pca.components_.transpose()

    # Converting PDF to PNG, use pdftoppm in terminal and -rx -ry for
    # resolution settings
    pdf_path = os.path.dirname(os.getcwd()) + "/pca.pdf"

    def draw_loadings(target_ax):
        """Write every feature name at its (PC1, PC2) loading."""
        l1, l2 = loadings[:, 0], loadings[:, 1]
        # Invisible markers: they only make the axes span the loadings.
        target_ax.scatter(l1, l2, 100, edgecolors='none', facecolors='none')
        for lx, ly, feature in zip(l1, l2, self.features):
            target_ax.text(lx, ly, feature, ha='center', va='center',
                           color='black',
                           fontdict={'family': 'Arial', 'size': 6})

    def draw_origin_lines():
        """Draw dashed grey lines through the origin of the current axes."""
        plt.axhline(y=0, ls="--", lw=0.5, c='0.75')
        plt.axvline(x=0, ls="--", lw=0.5, c='0.75')

    if sbrn_plt:
        data = [(title.split("_")[0], author, pc1, pc2)
                for pc1, pc2, title, author
                in zip(X_bar[:, 0], X_bar[:, 1], self.titles, self.authors)]
        df = pd.DataFrame(data, columns=['title', 'author', 'PC1', 'PC2'])
        sns.set_style('darkgrid')
        # x/y by keyword: newer seaborn releases reject positional x/y.
        sns_plot = sns.lmplot(x='PC1', y='PC2', data=df, fit_reg=False,
                              hue="author",
                              scatter_kws={"marker": "+", "s": 100},
                              markers='o', legend=False)
        plt.legend(loc='upper right')
        plt.tight_layout()
        plt.show()
        sns_plot.savefig(pdf_path)
        return

    if not show_samples:
        # Loadings only.  There is a single axes here, so nothing needs to
        # be aligned against a second one.
        fig = plt.figure(figsize=(8, 6))
        ax2 = fig.add_subplot(111)
        draw_loadings(ax2)
        ax2.set_xlabel('PC1')
        ax2.set_ylabel('PC2')
        draw_origin_lines()
        plt.tight_layout()
        plt.show()
        fig.savefig(pdf_path, bbox_inches='tight', transparent=True,
                    format='pdf')
        return

    # Generate color dictionary: one colour index per author
    color_dict = {author: index
                  for index, author in enumerate(sorted(set(self.authors)))}
    cmap = discrete_cmap(len(color_dict), base_cmap='brg')

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    x1, x2 = X_bar[:, 0], X_bar[:, 1]

    # Invisible markers: they only make the axes span all samples.
    ax.scatter(x1, x2, 100, edgecolors='none', facecolors='none',
               cmap='rainbow')
    for p1, p2, author, title in zip(x1, x2, self.authors, self.titles):
        ax.scatter(p1, p2, marker='o', color=cmap(color_dict[author]), s=20)
        ax.text(p1, p2, title.split('_')[-1], color='black',
                fontdict={'size': 5})

    # Legend settings: one patch per author, in sorted order so the legend
    # is the same on every run.
    collected_patches = [
        mpatches.Patch(color=cmap(color_dict[author]),
                       label=author.split('-')[0])
        for author in sorted(color_dict)
    ]
    plt.legend(handles=collected_patches, fontsize=7)

    ax.set_xlabel('Principal Component 1 \n \n Explained Variance: {}% \n Sample Size: {} words/sample \n Number of Features: {} features'.format(
        str(explained_variance), str(self.sample_size),
        str(len(self.features))), fontdict={'size': 7})
    ax.set_ylabel('Principal Component 2', fontdict={'size': 7})

    if show_loadings:
        ax2 = ax.twinx().twiny()
        draw_loadings(ax2)
        # Align axes
        # Important to adjust margins first when function words fall
        # outside plot. This is due to the axes aligning (def align).
        ax2.margins(x=0.14, y=0.14)
        align_xaxis(ax, 0, ax2, 0)
        align_yaxis(ax, 0, ax2, 0)

    draw_origin_lines()
    plt.tight_layout()
    plt.show()

    fig.savefig(pdf_path, transparent=True, format='pdf')