def test_empty_breaks():
    """Every formatter must map an empty list of breaks to an empty list."""
    empty = []
    formatters = (
        custom_format(),
        comma_format(),
        currency_format(),
        percent_format(),
        scientific_format(),
        date_format(),
        mpl_format(),
        log_format(),
        timedelta_format(),
    )
    for formatter in formatters:
        assert formatter(empty) == []
def test_percent_format():
    """percent_format scales values by 100 and suffixes them with '%'."""
    formatter = percent_format()

    # Values of the same (or nearly the same) precision
    assert formatter([0.12, 0.23, 0.34, 0.45]) == ['12%', '23%', '34%', '45%']
    assert formatter([0.12, 0.23, 0.34, 4.5]) == ['12%', '23%', '34%', '450%']

    # Mixed precision values are rendered at a common precision
    assert formatter([0.12, 0.23, 0.34, 45]) == ['10%', '20%', '30%', '4500%']
def plot_elos():
    """Plot expected win rate against the Elo gap to the opponent."""
    elo_gap = np.linspace(-1000, +1000)
    # Standard Elo win-probability curve: P = 1 / (1 + 10^(-diff/400))
    win_rate = 1 / (1 + 10**(-elo_gap / 400))
    frame = pd.DataFrame({'elo': elo_gap, 'winrate': win_rate})

    fig = (
        pn.ggplot(frame)
        + pn.geom_line(pn.aes(x='elo', y='winrate'))
        + pn.geom_vline(xintercept=0, alpha=.1)
        + pn.geom_hline(yintercept=.5, alpha=.1)
        + pn.labs(
            x='Own Elo relative to opponent\'s Elo',
            y='Win rate v. opponent')
        + pn.scale_y_continuous(labels=percent_format())
        + pn.coord_cartesian(expand=False)
        + plot.IEEE())
    return fig
def plot_calibrations():
    """Plot calibration win-rate intervals per board size against perfect play."""
    params = data.sample_calibrations()

    fig = pn.ggplot(
        params,
        pn.aes(xmin='boardsize-.25',
               xmax='boardsize+.25',
               group='boardsize',
               fill='factor(boardsize)'))
    # Reference line at a 50% win rate
    fig += pn.geom_hline(yintercept=.5, alpha=.2)
    # Interval rectangle plus a thick bar at the midpoint
    fig += pn.geom_rect(
        pn.aes(ymin='lower', ymax='upper'), show_legend=False, color='k')
    fig += pn.geom_rect(
        pn.aes(ymin='mid', ymax='mid'), show_legend=False, color='k', size=2)
    fig += pn.scale_y_continuous(labels=percent_format())
    fig += pn.scale_fill_hue(l=.4)
    fig += pn.coord_cartesian(ylim=(.4, .6))
    fig += pn.labs(y='Win rate v. perfect play', x='Board size')
    fig += plot.IEEE()
    return fig
def plot_importance_lgb(importance):
    """Horizontal bar chart of feature importances, y-axis formatted as percent.

    NOTE: mutates the 'importance' and 'feature' columns of the caller's
    frame in place (no copy is taken).
    """
    # Ugly but pip install problem on Airflow otherwise
    import plotnine as pn
    from plotnine import ggplot, aes  # noqa
    # from plotnine.geoms import *  # noqa

    scale = 1.5
    pn.options.figure_size = (6.4 * scale, 4.8 * scale)

    from mizani.formatters import percent_format  # noqa
    # from mizani.breaks import date_breaks  # noqa
    # from mizani.formatters import date_format  # noqa

    # Rescale to fractions so percent_format renders them as percentages,
    # and reverse the category order so bars appear top-to-bottom after flip.
    importance['importance'] = importance['importance'] / 100
    importance['feature'] = pd.Categorical(
        importance['feature'], importance['feature'][::-1], ordered=True)

    fig = ggplot(importance, aes('feature', 'importance'))
    fig += pn.geom_bar(stat='identity')
    fig += pn.coords.coord_flip()
    fig += pn.scales.scale_y_continuous(labels=percent_format())
    fig += pn.labs(title='Feature importance', x='Feature', y='Gain')
    return fig
# Relabel the log-scale x ticks as linear values (10^loc) on the last axis.
xax = axes[-1].xaxis
xax.set_ticklabels(lf(10.0**ex) for ex in xax.get_ticklocs())
plt.xlabel("growth rate [1/h]")
plt.ylabel("tradeoff")
plt.savefig("figures/dists.svg")
plt.close()

# Fraction of taxa with a non-negligible growth rate (> 1e-6) per
# (id, tradeoff) group.
non_zero = (rates.groupby([
    "id", "tradeoff"
]).apply(lambda df: (df.growth_rate > 1e-6).sum() / df.shape[0]).reset_index(
    name="non_zero"))

# Boxplot + jitter of the growing fraction across tradeoff values.
pl = (ggplot(non_zero, aes(x="tradeoff", y="non_zero")) +
      geom_boxplot(outlier_color="none") +
      geom_jitter(width=0.15, height=0, alpha=0.5, stroke=0) +
      scale_y_continuous(labels=percent_format()) +
      labs(x="tradeoff", y="percent taxa growing"))
pl.save("figures/percent_growing.svg", width=5, height=5)

# Show some representative correlations
# Restrict to tradeoff 0.5 and to samples with at least 10 observations,
# then annotate each sample id with its correlation coefficient.
comp = both[(both.tradeoff == "0.5")]
w = within_samples[within_samples.tradeoff == "0.5"]
w.index = w.id
comp = comp[comp.id.isin(w.id[(w.n >= 10)])]
comp["rho"] = w.loc[comp.id, "rho"].round(2).values
comp.id = comp.id + " (r=" + comp.rho.astype(str) + ")"

# Scatter of replication rate vs. predicted growth rate, one panel per sample.
pl = (ggplot(comp, aes(x="rate", y="growth_rate")) +
      geom_point() +
      facet_wrap("~ id", nrow=3) +
      scale_x_log10() +
      scale_y_log10() +
      labs(x="replication rate [a.u.]", y="predicted growth rate [1/h]"))
pl.save("figures/corr_examples.png", width=12, height=6, dpi=300)
gg_rep_act.save(os.path.join(dir_output, 'gg_rep_act.png'), width=8, height=4)

# Human-readable labels for the investigation-note categories.
di_notes = {
    'chi2': 'χ2-correction',
    'insig': 'Erroneous',
    'specification': 'Specification',
    'non-replicable': 'Inconsistent'
}

# (ii) Breakdown of counts
# Join the per-note counts with the literature totals (value_counts of tt),
# renaming the reset_index columns so 'tt' is the key and 'n_lit' the total.
tmp = acc_tt.merge(
    res_fisher.tt.value_counts().reset_index().rename(columns={
        'index': 'tt',
        'tt': 'n_lit'
    }))
# Map codes to display labels and compute each note's share of its literature.
tmp = tmp.assign(tt=lambda x: x.tt.map(di_tt),
                 notes=lambda x: x.notes.map(di_notes),
                 share=lambda x: x.n / x.n_lit)

# Dodged bar chart of the share per investigation category, by literature.
gg_acc_notes = (
    pn.ggplot(tmp, pn.aes(x='notes', y='share', fill='tt')) +
    pn.theme_bw() +
    pn.scale_y_continuous(labels=percent_format(), limits=[0, 0.1]) +
    pn.scale_fill_discrete(name='Literature') +
    pn.geom_col(color='black', position=pn.position_dodge(0.5), width=0.5) +
    pn.labs(y='Percent', x='Investigation') +
    pn.theme(axis_text_x=pn.element_text(angle=45),
             axis_title_x=pn.element_blank()))
gg_acc_notes.save(os.path.join(dir_output, 'gg_acc_notes.png'), width=7,
                  height=3)
print('~~~ End of 4_results_insig.py ~~~')
def density_plot(df_original, variable, target=False, no_outliers=False,
                 title=None, mean=True, mode=True, q1=True, q3=True):
    '''
    Draw a scaled density plot of ``variable``, optionally split by a
    categorical ``target`` column, with vertical reference lines at
    summary statistics. The plot is rendered with matplotlib; nothing
    is returned.

    :param df_original: source dataframe (a copy is used, the original is
        left untouched)
    :param variable: name of the numeric column to plot
    :param target: name of the categorical column used to split/fill the
        densities, or False (default) for a single unsplit density
    :param no_outliers: if True, limit the x-axis to the interquartile range
    :param title: optional plot title
    :param mean: draw a dashed line at the mean
    :param mode: draw a solid line at the 50% quantile (this is the median,
        despite the parameter's name — kept for backward compatibility)
    :param q1: draw a solid line at the 25% quantile
    :param q3: draw a solid line at the 75% quantile
    '''
    df = df_original.copy()
    x_label = variable.lower().replace("_", " ").title()

    if target == False:
        # Single density: no fill mapping, flat fill color.
        # (Previously the target column was stringified and mapped to fill
        # unconditionally, which raised a KeyError for the default
        # target=False; guard both operations instead.)
        graph = ggplot(df) + aes(x=variable, y='..scaled..')
        graph += geom_density(fill=colors.FIRST_COLOR)
    else:
        # Coerce the target to str so it is treated as categorical.
        df[target] = df[target].apply(str)
        fill_label = target.lower().replace("_", " ").title()
        graph = ggplot(df) + aes(x=variable, y='..scaled..', fill=target)
        graph += geom_density(alpha=.5)
        graph += scale_fill_manual(
            values=[colors.FIRST_COLOR, colors.SECOND_COLOR],
            name=fill_label)

    graph += (theme_bw() +
              theme(axis_line_x=element_line(color='gray'),
                    axis_line_y=element_line(color='gray'),
                    line=element_line(color='white')))

    # labels
    graph += xlab(x_label) + ylab("Density")
    graph += scale_y_continuous(labels=percent_format())  # custom_format('{:.2f} USD')

    # Vertical reference lines at summary statistics.
    line_args = {"color": colors.THIRD_COLOR, "size": .5}
    var_describe = df[variable].describe()
    if mean:
        graph += geom_vline(xintercept=var_describe.loc["mean"],
                            linetype="dashed", **line_args)
    if mode:
        # NOTE: "50%" is the median, not the statistical mode.
        graph += geom_vline(xintercept=var_describe.loc["50%"],
                            linetype="solid", **line_args)
    if q1:
        graph += geom_vline(xintercept=var_describe.loc["25%"],
                            linetype="solid", **line_args)
    if q3:
        graph += geom_vline(xintercept=var_describe.loc["75%"],
                            linetype="solid", **line_args)

    # title
    if title is not None:
        graph += ggtitle(str(title))

    # no outliers: clip the x-axis to the interquartile range
    if no_outliers:
        max_75 = var_describe.loc['75%']
        min_25 = var_describe.loc['25%']
        graph += xlim(min_25, max_75)

    # show
    graph.draw()
    plt.show()
scale_x_datetime(date_breaks='5 years', date_labels='%Y') + scale_color_discrete(name='HPI', labels=['CREA', 'Teranet'])) gg_save('gg_tera_crea_lvl.png', dir_figures, gg_tera_crea_lvl, 12, 5) # (iii) CREA vs Teranet m/m tmp = df_hpi_both.groupby( ['city', 'hpi']).apply(lambda x: x.value / x.value.shift(1) - 1).reset_index() df_hpi_w = df_hpi_both.assign(mm=tmp.sort_values('level_2').value.values) df_hpi_w = df_hpi_w.pivot_table('mm', ['date', 'city'], 'hpi').reset_index() df_hpi_w = df_hpi_w.sort_values(['city', 'date']).reset_index(None, True) gg_tera_crea_pct = (ggplot(df_hpi_w, aes(x='crea', y='tera')) + geom_point(size=0.5) + theme_bw() + theme(axis_text_x=element_text(angle=90)) + scale_x_continuous(labels=percent_format()) + scale_y_continuous(labels=percent_format()) + labs(x='CREA', y='Teranet', title='month-over-month %') + facet_wrap('~city', nrow=2)) gg_save('gg_tera_crea_pct.png', dir_figures, gg_tera_crea_pct, 12, 5) # (iv) Find the optimal correlation lag_seq = np.arange(13) alpha = 0.1 n_bs = 1000 holder = [] for lag in lag_seq: print(lag) tmp_lag = df_hpi_w.assign( crea=df_hpi_w.groupby('city').crea.shift(lag)).dropna() tmp_lag_bs = tmp_lag.groupby('city').sample(frac=n_bs,