def graph_histogram_classification(dataset, round_id, y, part='eval'):
    """
    generate the histogram of predictions

    One density curve per class is drawn from the matching column of ``y``.
    The figure is rendered and saved twice, once per theme (dark and light).
    Any failure is logged instead of being raised.

    :param dataset: dataset object (``y_class_names`` and ``dataset_id`` are read)
    :param round_id: id of the round (model)
    :param y: prediction values, indexed as ``y[:, i]`` for the i-th class name
    :param part: set (eval / train set)
    :return: None
    """
    themes = [(True, 'dark_background'), (False, 'seaborn-whitegrid')]
    try:
        for dark, theme in themes:
            with plt.style.context(theme, after_reset=True):
                plt.figure(figsize=(6, 6))
                for i, name in enumerate(dataset.y_class_names):
                    sns.distplot(y[:, i], hist=False, label=name)
                plt.title('histogram of probabilities (%s set)' % part)
                plt.xlabel('values')
                plt.ylabel('frequencies')
                plt.legend()
                __save_fig(dataset.dataset_id,
                           'hist_%s_%s' % (part, round_id), dark)
    except Exception:
        # Graph generation is best effort, so errors are only logged.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        log.error(
            'error in graph_histogram_classification with dataset_id %s'
            % dataset.dataset_id)
def _update_plot(self, axis, view):
    """
    Draw the distribution of the view's first dimension on ``axis``.

    Keyword arguments come from the style entry selected by
    ``self.cyclic_index``. A legend label is added when the plot is overlaid
    and the view has a non-empty label; ``vertical=True`` is added when
    ``self.invert_axes`` is set.
    """
    # Copy the style entry: the per-call additions below must not leak into
    # the stored style and resurface on later calls (e.g. a stale label).
    kwargs = dict(self.style[self.cyclic_index])
    label = view.label if self.overlaid >= 1 else ''
    if label:
        kwargs['label'] = label
    if self.invert_axes:
        kwargs['vertical'] = True
    sns.distplot(view.dimension_values(0), ax=axis, **kwargs)
def plot_angle(data, N=50, title=None, ax1=None, ax2=None, color=None,
               wrap=True):
    """
    Plot the distribution of an angle as a polar bar chart next to a
    histogram / KDE plot.

    Parameters
    ----------
    data: array-like of angles
    N: int, optional (default: 50)
        Number of bins used for both plots
    title: str, optional (default: None)
        Figure title; nothing is set when None
    ax1: matplotlib axis, optional
        Polar axis for the bar chart; created when ax1 or ax2 is missing
    ax2: matplotlib axis, optional
        Axis for the density plot; created when ax1 or ax2 is missing
    color: str, optional (default: None)
        Color passed to both plots
    wrap: bool, optional (default: True)
        True: angles go through ``wrapAngle``, axis spans -180..180
        False: angles go through ``constrainAngle``, axis spans 0..360

    Returns
    -------
    (f, (ax1, ax2)): the current figure and the two axes
    """
    if ax1 is None or ax2 is None:
        gs = gridspec.GridSpec(2, 6)
        ax1 = plt.subplot(gs[:1, :2], polar=True)
        ax2 = plt.subplot(gs[:1, 2:])

    if wrap:
        vf = np.vectorize(wrapAngle)
    else:
        vf = np.vectorize(constrainAngle)
    x = vf(data)

    sns.distplot(x, bins=N, ax=ax2, color=color, kde=True)
    # `normed` was removed from np.histogram; `density` gives the same
    # normalisation for the equal-width bins requested here.
    radii, theta = np.histogram(x, bins=N, density=True)
    ax1.set_yticklabels([])

    if wrap:
        ax1ticks = [0, 45, 90, 135, 180, -135, -90, -45]
        ax2ticks = list(range(-180, 180 + 45, 45))
        ax1.set_xticklabels(['{}°'.format(tick) for tick in ax1ticks])
        ax2.set_xlim(-180, 180)
    else:
        ax2ticks = list(range(0, 360 + 45, 45))
        ax2.set_xlim(0, 360)
    ax2.set_xticks(ax2ticks)
    ax2.set_xticklabels(['{}°'.format(tick) for tick in ax2ticks])

    ax2.set_yticks([])
    ax2.set(xlabel='Angle', ylabel='Density')
    sns.despine(ax=ax2)

    # theta holds the N + 1 bin edges; bars are placed at the right edges.
    width = (2 * np.pi) / N
    ax1.bar(np.deg2rad(theta[1:]), radii, width=width, color=color, alpha=.5)

    if title is not None:
        plt.suptitle(title)
    plt.tight_layout()
    f = plt.gcf()
    return f, (ax1, ax2)
def graph_histogram(dataset_id, col, is_categorical, values, part='train'):
    """
    generate the histogram of column col of the dataset

    The figure is rendered and saved twice, once per theme (dark and light).
    Categorical values are label-encoded, plotted as counts, and the ticks
    are relabelled with the original categories. Any failure is logged
    instead of being raised.

    :param dataset_id: dataset id
    :param col: column name
    :param is_categorical: is the column categorical
    :param values: values of the column
    :param part: set (train, test)
    :return: None
    """
    try:
        if is_categorical:
            # Encode once, before the theme loop. Re-encoding inside the loop
            # would fit the second pass on already-encoded integers, so the
            # light-theme figure would show codes instead of category names.
            frame = pd.DataFrame(values)
            frame.columns = ['y']
            encoder = LabelEncoder()
            frame['y'] = encoder.fit_transform(frame['y'])
            values = frame['y'].values
            tick_positions = list(range(max(values) + 1))
            x_labels = encoder.inverse_transform(tick_positions)

        for dark, theme in [(True, 'dark_background'),
                            (False, 'seaborn-whitegrid')]:
            with plt.style.context(theme, after_reset=True):
                plt.figure(figsize=(7, 7))
                if is_categorical:
                    sns.distplot(values, kde=False)
                    plt.xticks(tick_positions, x_labels, rotation=90)
                else:
                    sns.distplot(values)
                plt.title('distribution of %s (%s set)' % (col, part))
                plt.xlabel('values')
                plt.ylabel('frequencies')
                __save_fig(dataset_id, '_hist_%s_%s' % (part, col), dark)
    except Exception:
        # Best effort: log and continue. Catching Exception (not a bare
        # except) lets KeyboardInterrupt and SystemExit propagate.
        log.error('error in graph_histogram with dataset_id %s' % dataset_id)
def distplot(self, x=None, data=None, *args, **kwargs):
    """
    Flexibly plot a univariate distribution of observations

    One subplot is drawn per requested column, stacked vertically.
    Non-finite values (inf / nan) are dropped from a column before plotting
    and a warning is logged.

    Parameters
    ----------
    x : str or list of str, input variables; these should be column names
        in data
    data : pandas dataframe
    **kwargs : other arguments in seaborn.distplot
        bins : argument for matplotlib hist(), or None, optional
        hist : bool, optional whether to plot a (normed) histogram
        kde : bool, optional, whether to plot a gaussian kernel
            density estimate
        rug : bool, optional whether to draw a rugplot on the support axis
        fit : random variable object, optional
        color : matplotlib color, optional
        vertical : bool, optional
        norm_hist : bool, optional
        axlabel : string, False, or None, optional
        label : string, optional

    Returns
    -------
    axes : sequence of matplotlib axes, one per plotted column

    Raises
    ------
    ValueError : if data is not a pandas dataframe or a requested column
        is not in data

    References
    ----------
    Seaborn distplot further documentation
    https://seaborn.pydata.org/generated/seaborn.distplot.html
    """
    # check data
    if not isinstance(data, pd.DataFrame):
        raise ValueError('data must be pandas dataframe')

    # handle single string
    if not isinstance(x, (list, tuple, np.ndarray, pd.Index)):
        x = [x]

    # create fig and axes (the current figure is closed first)
    nrows = len(x)
    plt.close()
    fig, axes = plt.subplots(nrows=nrows, ncols=1, sharex=self.sharex,
                             figsize=(self.size[0], nrows * self.size[1]))

    # HACK: handle Axes indexing when only one ax in fig
    if nrows == 1:
        axes = [axes]

    # iterate thru x
    for i, col in enumerate(x):
        # check if col in data
        if col not in data.columns.values:
            raise ValueError('{} is NOT in data'.format(col))

        a = data[col]
        if np.logical_not(np.isfinite(a)).any():
            logger.warning(
                'RUNTIME WARNING: {} column has inf or nan '.format(col))
            a = a.replace([-np.inf, np.inf], np.nan).dropna()

        # NOTE(review): the data is passed as keyword `a`, so positional
        # *args presumably cannot be used without a clash - confirm.
        sns.distplot(a=a, ax=axes[i], *args, **kwargs)

        axes[i].set_title(
            label='Univariate Distribution of {}'.format(col),
            fontsize=self.title_fontsize)
        axes[i].set_xlabel(xlabel=col, fontsize=self.label_fontsize)
        axes[i].set_ylabel(ylabel='percentage (%)',
                           fontsize=self.label_fontsize)
        # 'major' is the valid selector; 'maj' is not a recognised value.
        axes[i].tick_params(axis='both', which='major',
                            labelsize=self.tick_fontsize)

    fig.subplots_adjust(wspace=0.5, hspace=0.3, left=0.125, right=0.9,
                        top=0.9, bottom=0.1)
    fig.tight_layout()
    plt.show()
    return axes
def _update_plot(self, axis, view):
    """
    Draw ``view.data`` as a seaborn distribution plot on ``axis``.

    The view's label is used as legend label only when ``self.overlaid``
    equals 1; otherwise an empty label is passed.
    """
    if self.overlaid == 1:
        legend_label = view.label
    else:
        legend_label = ''
    sns.distplot(view.data, ax=axis, label=legend_label, **self.style)
def _update_plot(self, axis, view):
    """
    Draw ``view.data`` as a seaborn distribution plot on ``axis``.

    A single-space label is always passed, together with the options held
    in ``self.style``.
    """
    samples = view.data
    sns.distplot(samples, ax=axis, label=' ', **self.style)
def plot_angle(data, N=50, title=None, ax1=None, ax2=None, color=None,
               wrap=True):
    """
    Plot the distribution of an angle in polar coordinates and a standard
    histogram / KDE plot.

    Parameters
    ----------
    data: array-like (nsamples,)
    N: int, optional (default: 50)
        Number of bins to use for histogramming the data
    title: str, optional (default: None)
        The title of the plot
    ax1: matplotlib axis, optional
        The left hand side polar plot; created when ax1 or ax2 is missing
    ax2: matplotlib axis, optional
        The right hand side density plot; created when ax1 or ax2 is missing
    color: str, optional (default: None)
        A color string to use
    wrap: bool, optional (default: True)
        True: Wrap the angle between -180 and 180
        False: Constrain the angle between 0 and 360

    Returns
    -------
    f: matplotlib.figure
        The figure with both axis
    (ax1, ax2): tuple of matplotlib axis
        The left hand side polar plot and the right hand side density plot
    """
    if ax1 is None or ax2 is None:
        gs = gridspec.GridSpec(2, 6)
        ax1 = pp.subplot(gs[:1, :2], polar=True)
        ax2 = pp.subplot(gs[:1, 2:])

    if wrap:
        vf = np.vectorize(wrap_angle)
    else:
        vf = np.vectorize(constrain_angle)
    x = vf(data)

    sns.distplot(x, bins=N, ax=ax2, color=color, kde=True)
    # `normed` was removed from np.histogram; `density` gives the same
    # normalisation for the equal-width bins requested here.
    radii, theta = np.histogram(x, bins=N, density=True)
    ax1.set_yticklabels([])

    if wrap:
        ax1ticks = [0, 45, 90, 135, 180, -135, -90, -45]
        ax2ticks = list(range(-180, 180 + 45, 45))
        ax1.set_xticklabels(['{}°'.format(tick) for tick in ax1ticks])
        ax2.set_xlim(-180, 180)
    else:
        ax2ticks = list(range(0, 360 + 45, 45))
        ax2.set_xlim(0, 360)
    ax2.set_xticks(ax2ticks)
    ax2.set_xticklabels(['{}°'.format(tick) for tick in ax2ticks])

    ax2.set_yticks([])
    ax2.set(xlabel='Angle', ylabel='Density')
    sns.despine(ax=ax2)

    # theta holds the N + 1 bin edges; bars are placed at the right edges.
    width = (2 * np.pi) / N
    ax1.bar(np.deg2rad(theta[1:]), radii, width=width, color=color, alpha=.5)

    if title is not None:
        pp.suptitle(title)
    pp.tight_layout()
    f = pp.gcf()
    return f, (ax1, ax2)
def init_artists(self, ax, plot_data, plot_kwargs):
    """
    Draw the seaborn distribution plot and return its artists.

    ``plot_data`` is unpacked as positional arguments and ``plot_kwargs`` as
    keyword arguments of ``sns.distplot``. The value returned by that call
    is stored under the ``'axis'`` key of the result.
    """
    drawn = sns.distplot(*plot_data, ax=ax, **plot_kwargs)
    return {'axis': drawn}
# Read the KDE curve that seaborn's distplot draws and interpolate its value
# at a chosen x coordinate.
try:
    # Only old seaborn releases ship `seaborn.apionly`.
    import seaborn.apionly as sns
except ImportError:
    import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)

x = np.random.randn(100)
print(type(x))

ax = sns.distplot(x, hist_kws={"ec": "k"})
# The first line on the axes is the KDE curve; take its sampled points.
data_x, data_y = ax.lines[0].get_data()
print(data_x)
print(data_y)

xi = 0  # coordinate where to find the value of kde curve
yi = np.interp(xi, data_x, data_y)
print("x={},y={}".format(xi, yi))  # prints x=0,y=0.3698
# Mark the interpolated point at xi (previously hard-coded to 0).
ax.plot([xi], [yi], marker="o")

# Re-plot the extracted curve on its own figure.
fig, subplots = plt.subplots(1, 1)
subplots.plot(data_x, data_y)
plt.show()
# In[350]: categ = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Alone', 'Survived'] conti = ['Fare', 'Age'] #Distribution fig = plt.figure(figsize=(16, 12)) for i in range(0, len(categ)): fig.add_subplot(3, 3, i + 1) sns.countplot(x=categ[i], data=df, alpha=.7) for col in conti: fig.add_subplot(3, 3, i + 2) sns.distplot(df[col].dropna(), kde_kws={ "lw": 2, "color": colors[8] }, hist_kws={"alpha": .5}) i += 1 plt.show() # In[373]: fig = plt.figure(figsize=(16, 10)) i = 1 for col in categ: if col != 'Survived': fig.add_subplot(3, 3, i) g = sns.countplot(x=col, data=df, hue='Survived', alpha=.7) plt.legend(loc=1)
def posterior_predictive_check():
    """
    Posterior predictive check for the days-to-first-price-update model.

    Observed outcomes are compared with replicated outcomes read from
    ``ypred_m8.rdat`` (one replicated data set per row) in two ways:

    * For each test statistic (mean, min, max, median, st-dev, number of
      zeros), a histogram over the replicated data sets is saved. It shows
      the observed value and the share of replicates exceeding it.
    * A two-panel figure contrasts the histogram of the observed data with
      overlaid histograms of a few replicated data sets.

    The model type and the amendment direction are chosen with the local
    switches at the top of the body. The function reads the module-level
    data frames ``df`` / ``dfa`` and the module-level colours ``blue``,
    ``purple`` and ``red``. All figures are written to the working
    directory.

    :return: None
    """
    modeltype = 'hurdle'
    # modeltype = 'negbin'
    if modeltype == 'hurdle':
        y_full = df['days_to_first_price_update'].values
    else:
        y_full = dfa['days_to_first_price_update'].values

    #### Import y-pred from RStan
    # Only one prediction file is read; the alternatives are kept as switches.
    # ypred_full = np.asarray(pd.read_fwf('ypred_m7.rdat'))  # brms hurdle model
    ypred_full = np.asarray(pd.read_fwf('ypred_m8.rdat'))  # brms hurdle model, main:spec
    # ypred_full = np.asarray(pd.read_fwf('ypred_m6.rdat'))  # brms negbin trunc zero
    # y_pred = np.asarray(pd.read_fwf('ypred_m7.rdat'))  # rstanarm negbin

    # direction = 'Up'
    # direction = 'Down'
    # direction = 'None'
    direction = 'Full'
    if direction in ["Up", "Down", "None"]:
        # Restrict observations and replicates to one amendment direction.
        amends_direction = np.where(df.amends == direction)[0]
        y = y_full[amends_direction]
        y_pred = ypred_full[:, amends_direction]
    else:
        y = y_full
        y_pred = ypred_full

    # pp_check: compare distributions of test statistics T(y) vs. T(yrep)
    def count_zeros(x):
        return Counter(x)[0]

    test_stats = [np.mean, np.min, np.max, np.median, np.std, count_zeros]
    tnames = ['Mean', 'Min', 'Max', 'Median', 'St-Dev', 'Number of Zeros']
    for test_stat, tname in zip(test_stats, tnames):
        fig = plt.figure(figsize=(10, 3))
        fig.add_subplot(111)
        # The observed statistic does not depend on the replicate, so it is
        # computed once per statistic.
        observed_stat = test_stat(y)
        test_stats_rep = [test_stat(yy) for yy in y_pred]
        n_above = sum(1 for stat_rep in test_stats_rep
                      if stat_rep > observed_stat)
        bayes_pval = round(n_above / len(test_stats_rep), 3)
        pval = r"$Pr(T(y_{{rep}}) > T(y_{{obs}}) | y_{{obs}}) = {}$".format(bayes_pval)
        ax0 = sb.distplot(test_stats_rep, kde=False, label=r"$T(y_{rep})$")
        ax0.axvline(observed_stat, color=blue, linewidth=4,
                    label=r"$T(y_{obs})$")
        ab = AnnotationBbox(
            TextArea(pval, textprops={'fontsize': 14}),
            (570, 100),
            xycoords='figure points',
            bboxprops={'boxstyle': 'square', 'fc': '#efefef',
                       'ec': '#9f9f9f'})
        ax0.add_artist(ab)
        plt.title("Test statistic: {}".format(tname))
        plt.legend(fontsize=14)
        plt.savefig("fig-{}-{}-hurdle".format(direction, tname))
        plt.close()

    # Posterior Predictive Distributions
    x_lim = 120
    y_lim = 50
    if direction == 'Up':
        color = blue
    elif direction == 'Down':
        color = purple
    elif direction == 'None':
        color = 'black'
    else:
        color = red

    # observed y
    fig = plt.figure(figsize=(10, 6))
    ax1 = fig.add_subplot(211)
    plt.hist(y, range=[0, x_lim], bins=x_lim, histtype='stepfilled',
             alpha=0.7, color=color)
    plt.title('Panel A: Distribution of Observed Data', fontsize='large')
    ax1.axvline(np.mean(y), linestyle='-', color='black', label='Mean')
    ax1.axvline(np.percentile(y, 50), linestyle='--', color='black',
                label='Median')
    ax1.axvline(np.percentile(y, 5), linestyle='-.', color='black',
                label=r'$5^{th}$ and $95^{th}$ Percentile')
    ax1.axvline(np.percentile(y, 95), linestyle='-.', color='black')
    plt.ylabel('Frequency')
    plt.ylim(0, y_lim)

    # replicated y: a handful of replicated data sets, overlaid
    ax2 = fig.add_subplot(212)
    ypred2 = y_pred[-200:-100:20]
    ax2.axvline(np.mean(ypred2), linestyle='-', color='black', label='Mean')
    ax2.axvline(np.percentile(ypred2, 50), linestyle='--', color='black',
                label='Median')
    ax2.axvline(np.percentile(ypred2, 5), linestyle='-.', color='black',
                label=r'$5^{th}$ and $95^{th}$ Percentile')
    ax2.axvline(np.percentile(ypred2, 95), linestyle='-.', color='black')
    for y_rep in ypred2:
        plt.hist(y_rep, range=[0, x_lim], bins=x_lim, histtype='stepfilled',
                 alpha=0.2, color=color)
    plt.xlim(0, x_lim)
    plt.title('Panel B: Posterior Predictive Distribution', fontsize='large')
    plt.ylabel('Frequency')
    plt.xlabel('Days to Price Amendment')
    plt.legend(fontsize='small')
    plt.ylim(0, y_lim)

    if modeltype == 'hurdle':
        plt.savefig('figA_posterior-pred-check-hurdle-{}'.format(direction))
    elif modeltype == 'negbin':
        plt.savefig('figA_posterior-pred-check2')
    else:
        plt.savefig('figA_posterior_pred-check?')
size=4000)  # Pick 4000 people, and give them groups
full_counts = np.random.poisson(lams[choices])  # Count their visits
truncated_counts = full_counts[full_counts > 0]  # Remove any counts that are zero
truncated_choices = choices[
    full_counts > 0]  # And also find the groups for those non-zero visitors
trunc_size = truncated_counts.size
colors = sns.color_palette(n_colors=2)

#%% Setup/Plot Dummy Data
# Single-rate example; this rebinds full_counts / truncated_counts from the
# section above.
lam = 1
full_counts = np.random.poisson(lam, size=2000)
truncated_counts = full_counts[full_counts > 0]
# Both normalised histograms are drawn with the same unit-width bins.
sns.distplot(full_counts, bins=np.arange(10), kde=False, norm_hist=True)
sns.distplot(truncated_counts, bins=np.arange(10), kde=False, norm_hist=True)

#%% Full Counts
# Fit a Poisson rate to the untruncated counts; `lam` is rebound here from
# the simulation constant to the model variable.
with pm.Model():
    lam = pm.HalfNormal('lam', 10)
    pm.Poisson('obs', mu=lam, observed=full_counts)
    trace = pm.sample(2500, cores=1)
pm.traceplot(trace)
plt.figure()
sns.distplot(trace.lam)
plt.axvline(1)  # reference line at 1, the rate used to simulate full_counts
def pdf_sns(y, nBins=50):
    """
    Return the sampled points of the first line seaborn's distplot draws
    for ``y``.

    The plot is produced as a side effect, because no target axes is passed
    to seaborn.

    :param y: observations handed to ``sns.distplot``
    :param nBins: accepted for interface compatibility; not used by the
        current implementation
    :return: tuple ``(xh, yh)`` with the x and y data of the first line on
        the returned axes (presumably the KDE curve - confirm)
    """
    try:
        # Only old seaborn releases ship `seaborn.apionly`.
        import seaborn.apionly as sns
    except ImportError:
        import seaborn as sns
    axes = sns.distplot(y, hist=True, norm_hist=False)
    xh, yh = axes.get_lines()[0].get_data()
    return xh, yh
def plot_pbo(pbo_result, hist=False):
    """
    Plot the three diagnostic panels of a PBO result and show the figure.

    Panels, top to bottom:

    1. Regression of ``R_bar_n_star`` (SR_OOS) on ``R_n_star`` (SR_IS). The
       legend shows slope, p-value and R^2 from ``pbo_result.linear_model``
       plus ``pbo_result.prob_oos_loss``.
    2. Distribution of ``pbo_result.logits`` (KDE and rug, optional
       histogram) with a reference line at 0.
    3. ``pbo_result.stochastic`` plotted with ``SD2`` on a secondary axis.

    :param pbo_result: result object providing the attributes named above
    :param hist: whether to draw the histogram in the logits panel
    :return: None
    """
    lm = pbo_result.linear_model

    # 'figure.figsize' is the rcParams key for the default figure size.
    # The former 'fig.figsize' lookup never matched, so the fallback was
    # always used.
    wid, h = plt.rcParams['figure.figsize']
    nplots = 3
    fig, axarr = plt.subplots(nplots, 1, sharex=False)
    fig.set_size_inches((wid, h * nplots))

    r2 = lm.rvalue**2
    line_label = (
        'slope: {:.4f}\n'.format(lm.slope)
        + 'p: {:.4E}\n'.format(lm.pvalue)
        + '$R^2$: {:.4f}\n'.format(r2)
        + 'Prob. OOS Loss: {:.1%}'.format(pbo_result.prob_oos_loss)
    )

    sns.regplot(
        x='SR_IS',
        y='SR_OOS',
        data=pd.DataFrame(
            dict(SR_IS=pbo_result.R_n_star,
                 SR_OOS=pbo_result.R_bar_n_star)),
        scatter_kws={'alpha': .3, 'color': 'g'},
        line_kws={
            'alpha': .8,
            'label': line_label,
            'linewidth': 1.,
            'color': 'r',
        },
        ax=axarr[0])
    axarr[0].set_title('Performance Degradation, IS vs. OOS')
    axarr[0].legend(loc='best')

    # TODO hist is turned off at the moment. Error occurs when S is set to
    # a relatively large number, such as 16.
    sns.distplot(
        pbo_result.logits,
        rug=True,
        bins=10,
        ax=axarr[1],
        rug_kws={'color': 'r', 'alpha': .5},
        kde_kws={'color': 'k', 'lw': 2., 'label': 'KDE'},
        hist=hist,
        hist_kws={
            'histtype': 'step',
            'linewidth': 2.,
            'alpha': .7,
            'color': 'g',
        })
    axarr[1].axvline(0, c='r', ls='--')
    axarr[1].set_title('Hist. of Rank Logits')
    axarr[1].set_xlabel('Logits')
    axarr[1].set_ylabel('Frequency')

    pbo_result.stochastic.plot(secondary_y='SD2', ax=axarr[2])
    axarr[2].right_ax.axhline(0, c='r')
    axarr[2].set_title('Stochastic Dominance')
    axarr[2].set_ylabel('Frequency')
    axarr[2].set_xlabel('SR Optimized vs. Non-Optimized')
    axarr[2].right_ax.set_ylabel('2nd Order Stoch. Dominance')

    plt.show()
ben_freq = 0
# Tally the labels in y: 'M' counts as malignant, 'B' as benign.
# NOTE(review): mal_freq is expected to be initialised before this point.
for w in y:
    if w == 'M':
        mal_freq += 1
    if w == 'B':
        ben_freq += 1
print("Malignant: " + str(mal_freq))
print("Benign: " + str(ben_freq))

# In[5]:

# Overlaid histograms of the mean radius, one per tumor type.
datas = data[data.type == 'M']  # rows of type M
sns.distplot(datas['mean radius'], kde=False, label='Malignant')  # mean radius of the malignant rows
datas = data[data.type == 'B']  # rows of type B
sns.distplot(datas['mean radius'], kde=False, label='Benign')  # mean radius of the benign rows
plt.legend()  # legend built from the two labels above
plt.title("Mean Radii")  # title
plt.xlabel("mean radius")  # axis labels
plt.ylabel("frequency")

# In[6]:

# Overlaid histograms of the mean texture, one per tumor type.
datas = data[data.type == 'M']  # rows of type M
sns.distplot(datas['mean texture'], kde=False, label='Malignant')  # mean texture of the malignant rows
datas = data[data.type == 'B']  # rows of type B