def test_wrong_bases():
    # x axis not transformed
    p = (ggplot(df, aes('x', 'x'))
         + annotation_logticks(sides='b', size=.75, base=10)
         + geom_point())

    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # x axis not transformed, but ticks requested for a different base
    p = (ggplot(df, aes('x', 'x'))
         + annotation_logticks(sides='b', size=.75, base=10)
         + scale_x_continuous(trans=log_trans(8))
         + geom_point())

    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # x axis is discrete
    df2 = df.assign(discrete=pd.Categorical([str(a) for a in df['x']]))
    p = (ggplot(df2, aes('discrete', 'x'))
         + annotation_logticks(sides='b', size=.75, base=None)
         + geom_point())

    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # y axis is discrete
    df2 = df.assign(discrete=pd.Categorical([str(a) for a in df['x']]))
    p = (ggplot(df2, aes('x', 'discrete'))
         + annotation_logticks(sides='l', size=.75, base=None)
         + geom_point())

    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # x axis is discrete + coord_flip
    df2 = df.assign(discrete=pd.Categorical([str(a) for a in df['x']]))
    p = (ggplot(df2, aes('discrete', 'x'))
         + annotation_logticks(sides='b', size=.75, base=None)
         + geom_point()
         + coord_flip())

    with pytest.warns(PlotnineWarning):
        p.draw_test()

    # y axis is discrete + coord_flip
    df2 = df.assign(discrete=pd.Categorical([str(a) for a in df['x']]))
    p = (ggplot(df2, aes('x', 'discrete'))
         + annotation_logticks(sides='l', size=.75, base=None)
         + geom_point()
         + coord_flip())

    with pytest.warns(PlotnineWarning):
        p.draw_test()
def plot_data_point(self, data_point_ix, use_base=True, figure_size=(8, 6)):
    """
    Plot Shapley values for an individual data point

    Parameters
    ----------
    data_point_ix : int
        Index of the data point whose Shapley values are plotted
    use_base : boolean, optional, default=True
        Whether to include the "BASE" (expected value) column in the plot
    figure_size : tuple, optional, default=(8, 6)
        Size of the resulting figure

    Returns
    -------
    g : ggplot object
    """
    # Check Shapley values exist
    if self._shapley_values is None:
        raise Exception("No Shapley values are available")
    d = self.get_shapley_values().loc[[data_point_ix]]
    if not use_base:
        d = d.drop("BASE", axis=1)
    g = (ggplot(d.reset_index(drop=False).melt(id_vars="index"),
                aes(x="variable", y="value", fill="variable"))
         + geom_bar(stat="identity")
         + labs(title="Shapley values (Index: " + str(data_point_ix) + ")",
                x="Feature", y="Shapley value", fill="Feature")
         + coord_flip())
    g += theme(figure_size=figure_size)
    return g
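# A minimal usage sketch: a stub class stands in for the real explainer so the
# method above can run end-to-end. The _FakeExplainer name and its feature
# columns are hypothetical; assumes the plotnine imports used above are in scope.
import pandas as pd

class _FakeExplainer:
    def __init__(self):
        # one row of Shapley values for two features plus the base value
        self._shapley_values = pd.DataFrame(
            {"BASE": [0.5], "feature_a": [0.2], "feature_b": [-0.1]})

    def get_shapley_values(self):
        return self._shapley_values

    plot_data_point = plot_data_point  # reuse the function above as a method

g = _FakeExplainer().plot_data_point(0, use_base=False)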
def test_annotation_logticks_coord_flip():
    p = (ggplot(df, aes('x', 'x'))
         + annotation_logticks(sides='b', size=.75)
         + geom_point()
         + scale_x_log10()
         + scale_y_log10()
         + coord_flip()
         + theme(panel_grid_minor=element_line(color='green'),
                 panel_grid_major=element_line(color='red')))

    assert p == 'annotation_logticks_coord_flip'
def plot_breakdown(cip_df: pd.DataFrame):
    """Stacked bar plot of increasing and decreasing stocks per sector in the specified df"""
    cols_to_drop = [colname for colname in cip_df.columns if colname.startswith('bin_')]
    df = cip_df.drop(columns=cols_to_drop)
    df = pd.DataFrame(df.sum(axis='columns'), columns=['sum'])
    df = df.merge(stocks_by_sector(), left_index=True, right_on='asx_code')

    if len(df) == 0:  # no stocks in cip_df have a sector? ie. ETF?
        return None

    assert set(df.columns) == set(['sum', 'asx_code', 'sector_name'])
    df['increasing'] = df.apply(lambda row: 'up' if row['sum'] >= 0.0 else 'down', axis=1)
    sector_names = df['sector_name'].value_counts().index.tolist()  # sort bars by value count (ascending)
    sector_names_cat = pd.Categorical(df['sector_name'], categories=sector_names)
    df = df.assign(sector_name_cat=sector_names_cat)
    # print(df)
    plot = (
        p9.ggplot(df, p9.aes(x='factor(sector_name_cat)', fill='factor(increasing)'))
        + p9.geom_bar()
        + p9.labs(x="Sector", y="Number of stocks")
        + p9.theme(axis_text_y=p9.element_text(size=7),
                   subplots_adjust={"left": 0.2, 'right': 0.85},
                   legend_title=p9.element_blank())
        + p9.coord_flip()
    )
    return plot_as_inline_html_data(plot)
def plot_bargraph(count_plot_df, plot_df):
    """
    Plots the bargraph

    Arguments:
        count_plot_df - the dataframe that contains lemma counts
        plot_df - the dataframe that contains the odds ratio and lemmas
    """
    graph = (
        p9.ggplot(count_plot_df.astype({"count": int}),
                  p9.aes(x="lemma", y="count"))
        + p9.geom_col(position=p9.position_dodge(width=0.5), fill="#253494")
        + p9.coord_flip()
        + p9.facet_wrap("repository", scales='free_x')
        + p9.scale_x_discrete(limits=(plot_df.sort_values(
            "odds_ratio", ascending=True).lemma.tolist()))
        + p9.scale_y_continuous(labels=custom_format('{:,.0g}'))
        + p9.labs(x=None)
        + p9.theme_seaborn(
            context='paper', style="ticks", font="Arial", font_scale=0.95)
        + p9.theme(
            # 640 x 480
            figure_size=(6.66, 5),
            strip_background=p9.element_rect(fill="white"),
            strip_text=p9.element_text(size=12),
            axis_title=p9.element_text(size=12),
            axis_text_x=p9.element_text(size=10),
        ))
    return graph
def plot_boxplot_series(df, normalisation_method=None):
    """
    Treating each column as a separate boxplot and each row as an
    independent observation (ie. different company), render a series of
    box plots to identify a shift in performance across the observations.
    normalisation_method should be one of the values present in
    SectorSentimentSearchForm.normalisation_choices
    """
    # normalise the data as requested, then plot it
    if normalisation_method is None or normalisation_method == "1":
        normalized_df = df
        y_label = "Percentage change"
    elif normalisation_method == "2":
        normalized_df = (df - df.min()) / (df.max() - df.min())
        y_label = "Percentage change (min/max. scaled)"
    else:
        normalized_df = df / df.max(axis=0)  # div by max if all else fails...
        y_label = "Percentage change (normalised by dividing by max)"

    n_inches = len(df.columns) / 5
    melted = normalized_df.melt(ignore_index=False).dropna()
    plot = (p9.ggplot(melted, p9.aes(x="fetch_date", y="value"))
            + p9.geom_boxplot(outlier_colour="blue")
            + p9.coord_flip())
    return user_theme(plot, y_axis_label=y_label, figure_size=(12, n_inches))
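# A minimal sketch (made-up data) of the "2" normalisation branch above:
# column-wise min/max scaling maps each company's percentage changes onto
# [0, 1], so series with very different volatilities become comparable.
import pandas as pd

prices = pd.DataFrame({"ABC": [-2.0, 0.0, 4.0], "XYZ": [-20.0, 0.0, 40.0]})
scaled = (prices - prices.min()) / (prices.max() - prices.min())
print(scaled)  # both columns now span 0.0 .. 1.0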
def test_coord_flip():
    p = (ggplot(df)
         + geom_rug(aes('x', 'y'), size=2, sides='l')
         + coord_flip())

    assert p + _theme == 'coord_flip'
def plot_pointplot(plot_df, y_axis_label="", use_log10=False, limits=[0, 3.2]):
    """
    Plots the pointplot

    Arguments:
        plot_df - the dataframe that contains the odds ratio and lemmas
        y_axis_label - the label for the y axis
        use_log10 - use log10 for the y axis?
        limits - limits for the continuous y axis (ignored when use_log10 is True)
    """
    graph = (
        p9.ggplot(plot_df, p9.aes(x="lemma", y="odds_ratio"))
        + p9.geom_pointrange(p9.aes(ymin="lower_odds", ymax="upper_odds"),
                             position=p9.position_dodge(width=1),
                             size=0.3, color="#253494")
        + p9.scale_x_discrete(limits=(plot_df.sort_values(
            "odds_ratio", ascending=True).lemma.tolist()))
        + (p9.scale_y_log10() if use_log10 else p9.scale_y_continuous(limits=limits))
        + p9.geom_hline(p9.aes(yintercept=1), linetype='--', color='grey')
        + p9.coord_flip()
        + p9.theme_seaborn(context='paper', style="ticks", font_scale=1, font='Arial')
        + p9.theme(
            # 640 x 480
            figure_size=(6.66, 5),
            panel_grid_minor=p9.element_blank(),
            axis_title=p9.element_text(size=12),
            axis_text_x=p9.element_text(size=10))
        + p9.labs(x=None, y=y_axis_label))
    return graph
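# A minimal sketch (fabricated values, illustrative only) of the columns
# plot_pointplot() expects: one row per lemma, with an odds ratio and its
# confidence-interval bounds. Assumes pandas (pd) is imported as above.
import pandas as pd

example_plot_df = pd.DataFrame({
    "lemma": ["signal", "network"],
    "odds_ratio": [1.8, 0.6],
    "lower_odds": [1.2, 0.4],
    "upper_odds": [2.7, 0.9],
})
example_graph = plot_pointplot(example_plot_df, y_axis_label="Odds ratio")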
def cell_cycle_phase_barplot(adata, palette='Set2'):
    """Plots the proportion of cells in each phase of the cell cycle

    See also: cell_cycle_phase_pieplot for the matplotlib pie chart

    Parameters
    ----------
    adata: AnnData
        The AnnData object being used for the analysis. Must have been
        previously evaluated by `tl.annotate_cell_cycle`.
    palette: str
        Name of the brewer palette passed to `scale_fill_brewer`.

    Returns
    -------
    A plotnine barplot with the total counts of cells in each phase of
    the cell cycle.
    """
    plt_data = adata.obs.copy()
    plt_data['cell_cycle_phase'] = pd.Categorical(
        plt_data['cell_cycle_phase'],
        categories=['G1 post-mitotic', 'G1 pre-replication', 'S/G2/M'])

    cycle_plot = (
        ggplot(plt_data, aes('cell_cycle_phase', fill='cell_cycle_phase'))
        + geom_bar()
        + coord_flip()
        + guides(fill=False)
        + labs(y='', x='Cell cycle phase')
        + theme_light()
        + theme(panel_grid_major_y=element_blank(),
                panel_grid_minor_y=element_blank(),
                panel_grid_major_x=element_line(size=1.5),
                panel_grid_minor_x=element_line(size=1.5))
        + scale_fill_brewer(type='qual', palette=palette))

    return cycle_plot
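# A minimal usage sketch: a SimpleNamespace stands in for an AnnData object,
# since only the `.obs` DataFrame with a 'cell_cycle_phase' column is used
# here. (Synthetic data; a real call would pass an AnnData previously
# annotated by tl.annotate_cell_cycle.)
import pandas as pd
from types import SimpleNamespace

fake_adata = SimpleNamespace(obs=pd.DataFrame({
    'cell_cycle_phase': ['S/G2/M', 'G1 post-mitotic',
                         'G1 pre-replication', 'S/G2/M'],
}))
plot = cell_cycle_phase_barplot(fake_adata)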
def plot_point_scores(stock: str, sector_companies, all_stocks_cip: pd.DataFrame, rules):
    """
    Visualise the stock in terms of point scores as described on the stock view page.
    Rules to apply can be specified by rules (default rules are provided by rule_*()).
    Points are lost for equivalent downturns and the result plotted. All rows in
    all_stocks_cip will be used to calculate the market average on a given trading
    day, whilst only sector_companies will be used to calculate the sector average.
    A utf-8 base64 encoded plot image is returned.
    """
    assert len(stock) >= 3
    assert all_stocks_cip is not None
    assert rules is not None and len(rules) > 0

    rows = []
    points = 0
    day_low_high_df = day_low_high(stock, all_dates=all_stocks_cip.columns)
    state = {
        "day_low_high_df": day_low_high_df,  # never changes each day, so we init it here
        "all_stocks_change_in_percent_df": all_stocks_cip,
        "stock": stock,
        "daily_range_threshold": 0.20,  # 20% at either end of the daily range gets a point
    }
    net_points_by_rule = defaultdict(int)
    for date in all_stocks_cip.columns:
        market_avg = all_stocks_cip[date].mean()
        sector_avg = all_stocks_cip[date].filter(items=sector_companies).mean()
        stock_move = all_stocks_cip.at[stock, date]
        state.update(
            {
                "market_avg": market_avg,
                "sector_avg": sector_avg,
                "stock_move": stock_move,
                "date": date,
            }
        )
        points += sum(map(lambda r: r(state), rules))
        for r in rules:
            k = r.__name__
            if k.startswith("rule_"):
                k = k[5:]
            net_points_by_rule[k] += r(state)
        rows.append({"points": points, "stock": stock, "date": date})

    df = pd.DataFrame.from_records(rows)
    df["date"] = pd.to_datetime(df["date"])
    point_score_plot = plot_series(df, x="date", y="points")

    rows = []
    for k, v in net_points_by_rule.items():
        rows.append({"rule": str(k), "net_points": v})
    df = pd.DataFrame.from_records(rows)
    net_rule_contributors_plot = (
        p9.ggplot(df, p9.aes(x="rule", y="net_points"))
        + p9.labs(x="Rule", y="Contribution to points by rule")
        + p9.geom_bar(stat="identity")
        + p9.theme(axis_text_y=p9.element_text(size=7), subplots_adjust={"left": 0.2})
        + p9.coord_flip()
    )
    return point_score_plot, plot_as_inline_html_data(net_rule_contributors_plot)
def test_annotation_stripes_coord_flip():
    p = (ggplot(df)
         + annotation_stripes()
         + geom_point(aes('factor(x)', 'y'))
         + geom_vline(xintercept=[0.5, 1.5, 2.5, 3.5])
         + coord_flip())

    assert p == 'annotation_stripes_coord_flip'
def test_annotation_logticks_coord_flip_discrete_bottom():
    df2 = df.assign(discrete=pd.Categorical(['A' + str(a) for a in df['x']]))
    p = (ggplot(df2, aes('x', 'discrete'))
         + annotation_logticks(sides='b', size=.75)
         + geom_point()
         + scale_x_log10()
         + coord_flip()
         + theme(panel_grid_minor=element_line(color='green'),
                 panel_grid_major=element_line(color='red')))

    assert p == 'annotation_logticks_coord_flip_discrete_bottom'
def plotfreq(freqdf):
    '''
    Parameters
    ----------
    freqdf : dataframe generated by freq()

    Returns
    -------
    Bar chart with frequencies & percentages in descending order

    Example
    -------
    import exploretransform as et
    df, X, y = et.loadboston()
    et.plotfreq(et.freq(X['town']))

    Warning
    -------
    This function will likely not plot more than 100 unique levels properly.
    '''
    # input checks
    if not isinstance(freqdf, pd.core.frame.DataFrame):
        return print("\nFunction only accepts dataframes\n")
    if len(freqdf.columns) != 4:
        return print("\nInput must be a dataframe generated by freq()\n")
    if sum(freqdf.columns[1:4] == ['freq', 'perc', 'cump']) != 3:
        return print("\nInput must be a dataframe generated by freq()\n")
    if len(freqdf) > 100:
        return print("\nUnable to plot more than 100 items")

    # label for plot
    lbl = freqdf['freq'].astype(str).str.cat(
        '[ ' + freqdf['perc'].astype(str) + '%' + ' ]', sep=' ')

    # create variable to be used in aes
    aesx = 'reorder(' + freqdf.columns[0] + ', freq)'

    # build plot
    plot = (pn.ggplot(freqdf)
            + pn.aes(x=aesx, y='freq', fill='freq', label=lbl)
            + pn.geom_bar(stat='identity')
            + pn.coord_flip()
            + pn.theme(axis_text_y=pn.element_text(size=6, weight='bold'),
                       legend_position='none')
            + pn.labs(x=freqdf.columns[0], y="Freq")
            + pn.scale_fill_gradient2(mid='bisque', high='blue')
            + pn.geom_text(size=6, nudge_y=.7))

    return plot
def barplot(df, key, figsize=(8, 6), vertical=False):
    if vertical:
        figsize = tuple(reversed(figsize))
    p9.options.figure_size = figsize
    top_l = df[key].value_counts().index.tolist()
    df[key] = pd.Categorical(df[key], categories=reversed(top_l))
    fig = p9.ggplot(p9.aes(x=key, y='..count..', label='..count..'), data=df)
    fig += p9.geom_bar(alpha=0.5)
    if vertical:
        fig += p9.coord_flip()
    fig += p9.stat_count(geom="text",
                         position=p9.position_stack(vjust=0.5),
                         size=10)
    fig += p9.theme_classic()
    return fig
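# A minimal usage sketch (made-up data), assuming plotnine (p9) and pandas
# (pd) are imported as in the function above: categories are ordered by
# frequency, and vertical=True flips both the bars and the figure size.
import pandas as pd

fruit_df = pd.DataFrame(
    {"fruit": ["apple", "apple", "banana", "cherry", "apple", "banana"]})
fig = barplot(fruit_df, "fruit", figsize=(6, 4), vertical=True)
fig.draw()  # or fig.save("fruit_counts.png")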
def plot_boxplot_series(df, normalisation_method=None):
    """
    Treating each column as a separate boxplot and each row as an
    independent observation (ie. different company), render a series of
    box plots to identify a shift in performance across the observations.
    normalisation_method should be one of the values present in
    SectorSentimentSearchForm.normalisation_choices
    """
    # compute star performers: those above the mean on a given day, counted over all days
    count = defaultdict(int)
    avg = df.mean(axis=0)  # per-column (per-day) mean across all companies
    for col in df.columns:
        winners = df[df[col] > avg[col]][col]
        for winner in winners.index:
            count[winner] += 1
    winner_results = []
    for asx_code, n_wins in count.items():
        x = df.loc[asx_code].sum()
        # avoid "dead cat bounce" stocks which fall spectacularly and then
        # post major increases in percentage terms
        if x > 0.0:
            winner_results.append((asx_code, n_wins, x))

    # and plot the normalised data
    if normalisation_method is None or normalisation_method == "1":
        normalized_df = df
        y_label = "Percentage change"
    elif normalisation_method == "2":
        normalized_df = (df - df.min()) / (df.max() - df.min())
        y_label = "Percentage change (min/max. scaled)"
    else:
        normalized_df = df / df.max(axis=0)  # div by max if all else fails...
        y_label = "Percentage change (normalised by dividing by max)"

    n_inches = len(df.columns) / 5
    melted = normalized_df.melt(ignore_index=False).dropna()
    plot = (
        p9.ggplot(melted, p9.aes(x="fetch_date", y="value"))
        + p9.geom_boxplot(outlier_colour="blue")
        + p9.theme(
            axis_text_x=p9.element_text(size=7),
            axis_text_y=p9.element_text(size=7),
            figure_size=(12, n_inches),
        )
        + p9.labs(x="Date (YYYY-MM-DD)", y=y_label)
        + p9.coord_flip()
    )
    return (
        plot_as_inline_html_data(plot),
        list(reversed(sorted(winner_results, key=lambda t: t[2]))),
    )
def test_annotation_stripes_coord_flip():
    pdf = mtcars.assign(gear=pd.Categorical(mtcars.gear),
                        am=pd.Categorical(mtcars.am))
    p = (
        ggplot(pdf)
        + annotation_stripes(fills=["#AAAAAA", "#FFFFFF", "#7F7FFF"], alpha=0.3)
        + geom_jitter(aes("gear", "wt", shape="gear", color="am"), random_state=5)
        + geom_vline(xintercept=0.5, color="black")
        + geom_vline(xintercept=1.5, color="black")
        + geom_vline(xintercept=2.5, color="black")
        + geom_vline(xintercept=3.5, color="black")
        + scale_shape_discrete(guide=guide_legend(order=1))  # work around #229
        + coord_flip())

    assert p == "annotation_stripes_coord_flip"
def plot_breakdown(ld: LazyDictionary) -> p9.ggplot:
    """Stacked bar plot of increasing and decreasing stocks per sector in the specified df"""
    cip_df = ld["cip_df"]
    cols_to_drop = [colname for colname in cip_df.columns if colname.startswith("bin_")]
    df = cip_df.drop(columns=cols_to_drop)
    df = pd.DataFrame(df.sum(axis="columns"), columns=["sum"])
    ss = ld["stocks_by_sector"]
    # ss should be:
    #          asx_code             sector_name
    # asx_code
    # 14D           14D             Industrials
    # 1AD           1AD             Health Care
    # 1AG           1AG             Industrials
    # 1AL           1AL  Consumer Discretionary
    # ...
    # print(ss)
    df = df.merge(ss, left_index=True, right_index=True)

    if len(df) == 0:  # no stocks in cip_df have a sector? ie. ETF?
        return None

    assert set(df.columns) == set(["sum", "asx_code", "sector_name"])
    df["increasing"] = df.apply(lambda row: "up" if row["sum"] >= 0.0 else "down", axis=1)
    sector_names = df["sector_name"].value_counts().index.tolist()  # sort bars by value count (ascending)
    sector_names_cat = pd.Categorical(df["sector_name"], categories=sector_names)
    df = df.assign(sector_name_cat=sector_names_cat)
    # print(df)
    plot = (p9.ggplot(df, p9.aes(x="factor(sector_name_cat)", fill="factor(increasing)"))
            + p9.geom_bar()
            + p9.coord_flip())
    return user_theme(
        plot,
        x_axis_label="Sector",
        y_axis_label="Number of stocks",
        subplots_adjust={"left": 0.2, "right": 0.85},
        legend_title=p9.element_blank(),
        asxtrade_want_fill_d=True,
    )
def plot_points_by_rule(net_points_by_rule: dict) -> p9.ggplot:
    if net_points_by_rule is None or len(net_points_by_rule) < 1:
        return None

    rows = []
    for k, v in net_points_by_rule.items():
        rows.append({"rule": str(k), "net_points": v})
    df = pd.DataFrame.from_records(rows)
    plot = (p9.ggplot(df, p9.aes(x="rule", y="net_points", fill="net_points"))
            + p9.geom_bar(stat="identity", alpha=0.7)
            + p9.coord_flip())
    return user_theme(
        plot,
        x_axis_label="Rule",
        y_axis_label="Contributions to points by rule",
        subplots_adjust={"left": 0.2},
        asxtrade_want_fill_continuous=True,
    )
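# A minimal usage sketch (hypothetical rule names): any mapping of rule name
# to net points works, e.g. the defaultdict built by plot_point_scores() above.
from collections import defaultdict

example_points = defaultdict(int)
example_points["market_avg"] += 3
example_points["daily_range"] -= 1
plot = plot_points_by_rule(example_points)  # a themed ggplot, or None if empty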
def protobowl(fold=BUZZER_DEV_FOLD):
    df_rnn = pickle.load(
        open("output/buzzer/RNNBuzzer/{}_protobowl.pkl".format(fold), "rb")
    )
    df_rnn = df_rnn.groupby(["Possibility", "Outcome"])
    df_rnn = df_rnn.size().reset_index().rename(columns={0: "Count"})
    df_rnn["Model"] = pd.Series(["RNN" for _ in range(len(df_rnn))], index=df_rnn.index)

    df_mlp = pickle.load(
        open("output/buzzer/MLPBuzzer/{}_protobowl.pkl".format(fold), "rb")
    )
    df_mlp = df_mlp.groupby(["Possibility", "Outcome"])
    df_mlp = df_mlp.size().reset_index().rename(columns={0: "Count"})
    df_mlp["Model"] = pd.Series(["MLP" for _ in range(len(df_mlp))], index=df_mlp.index)

    df_thr = pickle.load(
        open("output/buzzer/ThresholdBuzzer/{}_protobowl.pkl".format(fold), "rb")
    )
    df_thr = df_thr.groupby(["Possibility", "Outcome"])
    df_thr = df_thr.size().reset_index().rename(columns={0: "Count"})
    df_thr["Model"] = pd.Series(
        ["Threshold" for _ in range(len(df_thr))], index=df_thr.index
    )

    df = df_rnn.append(df_mlp, ignore_index=True)
    df = df.append(df_thr, ignore_index=True)

    outcome_type = CategoricalDtype(categories=[15, 10, 5, 0, -5, -10, -15])
    df["Outcome"] = df["Outcome"].astype(outcome_type)
    model_type = CategoricalDtype(categories=["Threshold", "MLP", "RNN"])
    df["Model"] = df["Model"].astype(model_type)

    p = (
        ggplot(df)
        + geom_col(aes(x="Possibility", y="Count", fill="Outcome"), width=0.7)
        + facet_grid("Model ~")
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=0.17)
        + scale_fill_brewer(type="div", palette=7)
    )
    figure_dir = os.path.join("output/buzzer/{}_protobowl.pdf".format(fold))
    p.save(figure_dir)
def plot_sector_top_eps_contributors(
        df: pd.DataFrame, stocks_by_sector_df: pd.DataFrame) -> p9.ggplot:
    """
    Returns a plot of the top 10 EPS contributors per sector, based on the
    most recent EPS value per stock in the dataframe. If no stocks in a given
    sector have positive EPS, the sector will not be plotted.
    """
    most_recent_date = df.columns[-1]
    last_known_eps = df[most_recent_date]
    last_known_eps = last_known_eps[last_known_eps >= 0.0].to_frame()
    # print(stocks_by_sector_df)
    last_known_eps = last_known_eps.merge(stocks_by_sector_df,
                                          left_index=True,
                                          right_on="asx_code")
    last_known_eps["rank"] = last_known_eps.groupby(
        "sector_name")[most_recent_date].rank("dense", ascending=False)
    last_known_eps = last_known_eps[last_known_eps["rank"] <= 10.0]
    n_sectors = last_known_eps["sector_name"].nunique()
    last_known_eps["eps"] = last_known_eps[most_recent_date]

    plot = (
        p9.ggplot(
            last_known_eps,
            p9.aes(
                y="eps",
                x="reorder(asx_code,eps)",  # sort bars by eps within each sub-plot
                group="sector_name",
                fill="sector_name",
            ),
        )
        + p9.geom_bar(stat="identity")
        + p9.facet_wrap("~sector_name", ncol=1, nrow=n_sectors, scales="free")
        + p9.coord_flip())
    return user_theme(
        plot,
        y_axis_label="EPS ($AUD)",
        x_axis_label="Top 10 ASX stocks per sector as at {}".format(most_recent_date),
        subplots_adjust={"hspace": 0.4},
        figure_size=(12, int(n_sectors * 1.5)),
        asxtrade_want_cmap_d=False,
        asxtrade_want_fill_d=True,
    )
def lollipop(data):
    data = data.sort_values(by=['probability']).reset_index(drop=True)
    custom_order = pd.Categorical(data['label'], categories=data.label)
    data = data.assign(label_custom=custom_order)

    p = ggplot(data, aes('label_custom', 'probability')) + \
        geom_point(color="#88aa88", size=4) + \
        geom_segment(aes(x='label_custom', y=0,
                         xend='label_custom', yend='probability'),
                     color="#88aa88") + \
        coord_flip(expand=True) + \
        theme_minimal() + \
        labs(x="", y="probability", title="Most Likely Object") + \
        theme(plot_title=element_text(size=20, face="bold", ha="right"))

    fig = p.draw()
    figfile = BytesIO()
    plt.savefig(figfile, format='png', bbox_inches='tight')
    figfile.seek(0)  # rewind to beginning of file
    figdata_png = base64.b64encode(figfile.getvalue()).decode()
    return p, figdata_png
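# A minimal usage sketch (made-up classifier output): `data` needs 'label'
# and 'probability' columns; the second return value is a base64-encoded PNG
# suitable for embedding in an <img src="data:image/png;base64,..."> tag.
import pandas as pd

preds = pd.DataFrame({'label': ['cat', 'dog', 'bird'],
                      'probability': [0.7, 0.2, 0.1]})
p, figdata_png = lollipop(preds)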
def plot_cor(df):
    # drop missing correlations
    out = df[~df['corr'].isnull()]

    # add pair column
    out = out.assign(pair=out.col_1 + '&' + out.col_2)

    # add a sign column
    sign = ((out['corr'] > 0).astype('int')).to_list()
    sign = [['Negative', 'Positive'][i] for i in sign]
    out['sign'] = sign
    # out = out.sort_values('pair', ascending=False).reset_index(drop=True)

    # add ind column
    out['ind'] = [out.shape[0] - i for i in range(out.shape[0])]

    # plot using bands
    ggplt = p9.ggplot(data=out, mapping=p9.aes(x='pair', y='corr')) \
        + p9.geom_hline(
            yintercept=0,
            linetype="dashed",
            color="#c2c6cc"
        ) \
        + p9.geom_rect(
            alpha=0.4,
            xmin=out.ind.values - 0.4,
            xmax=out.ind.values + 0.4,
            ymin=out.lower.values,
            ymax=out.upper.values,
            fill=[['b', '#abaeb3'][int(x > 0.05)] for x in out.p_value]
        ) \
        + p9.geom_segment(
            x=out.ind.values - 0.4,
            y=out['corr'].values,
            xend=out.ind.values + 0.4,
            yend=out['corr'].values
        ) \
        + p9.coord_flip() \
        + p9.ylim(np.min(out.lower.values), np.max(out.upper.values)) \
        + p9.labs(x="", y="Correlation")

    return ggplt
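# A minimal sketch (fabricated numbers, illustrative only) of the input
# plot_cor() expects: one row per variable pair, with the correlation, its
# confidence band, and a p-value used to grey out non-significant bands.
# Assumes the p9/np/pd imports used above are in scope.
import pandas as pd

cor_df = pd.DataFrame({
    'col_1':   ['age',  'age'],
    'col_2':   ['fare', 'sibsp'],
    'corr':    [0.10,  -0.31],
    'lower':   [0.02,  -0.40],
    'upper':   [0.18,  -0.22],
    'p_value': [0.01,   0.20],
})
ggplt = plot_cor(cor_df)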
g = (p9.ggplot(binned_df, p9.aes(x="precision", y="edges", color="in_hetionet"))
     + p9.geom_point()
     + p9.geom_line()
     + p9.scale_color_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"]
     })
     + p9.facet_wrap("relation")
     + p9.scale_y_log10()
     + p9.theme_bw())
print(g)

# In[8]:

g = (p9.ggplot(binned_df, p9.aes(x="precision", y="edges", fill="in_hetionet"))
     + p9.geom_bar(stat='identity', position='dodge')
     + p9.scale_fill_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"]
     })
     + p9.coord_flip()
     + p9.facet_wrap("relation")
     + p9.scale_y_log10()
     + p9.theme_bw()
     + p9.theme(figure_size=(12, 8), aspect_ratio=9))
print(g)

# In[9]:

combined_sen_tree = {
    "DaG": {
        "file": "../../../disease_gene/disease_associates_gene/edge_prediction_experiment/output/combined_predicted_dag_sentences.tsv.xz",
        "group": ["doid_id", "entrez_gene_id"]
    },
    "CtD": {
        "file": "../../../compound_disease/compound_treats_disease/edge_prediction_experiment/output/combined_predicted_ctd_sentences.tsv.xz",
        "group": ["drugbank_id", "doid_id"]
def test_aesthetics_coordflip(self):
    assert self.p + coord_flip() == 'aesthetics+coord_flip'
#            height=1, width=6)

## can also use seaborn for strip plotting...
plt.close()
# plt.figure(figsize=(6, 1))
sns.stripplot(data=gse75386, y='class', x='Gad1', color='black')
# plt.savefig('gse75386_gad1_stripchart_bw.pdf',
#             format='pdf', bbox_inches='tight')

## -----------------------------------------------------------------
## GSE75386 overplotted bars
## -----------------------------------------------------------------
plt.close()
ggbar = gg.ggplot(gse75386, gg.aes(x='class', y='Gad1'))
ggbar += gg.geom_bar(alpha=0.1, position='identity', stat='identity')
ggbar += gg.coord_flip()
print(ggbar)
# ggbar.save('gse75386_gad1_barchart_id.pdf', format='pdf',
#            height=1, width=6)

## -----------------------------------------------------------------
## GSE75386 mean bars + SE lines
## -----------------------------------------------------------------
plt.close()
## use pandas functionality to compute stat transformations
gse75386means = gse75386[['class', 'Gad1']]\
        .groupby('class').agg(np.mean).iloc[:, 0]
gse75386ses = gse75386[['class', 'Gad1']]\
        .groupby('class').agg(lambda x: x.std() / np.sqrt(len(x)))\
        .iloc[:, 0]
gse75386stats = pd.DataFrame({
metadata_df["author_type"].value_counts() # # BioRxiv Research Article Categories # Categories assigned to each research article. Neuroscience dominates majority of the articles as expected. # In[9]: category_list = metadata_df.category.value_counts().index.tolist()[::-1] # plot nine doesn't implement reverse keyword for scale x discrete # ugh... g = ( p9.ggplot(metadata_df, p9.aes(x="category")) + p9.geom_bar(size=10, fill="#253494", position=p9.position_dodge(width=3)) + p9.scale_x_discrete(limits=category_list) + p9.coord_flip() + p9.theme_seaborn( context="paper", style="ticks", font="Arial", font_scale=1)) g.save("output/figures/preprint_category.png", dpi=500) print(g) # In[10]: metadata_df["category"].value_counts() # # New, Confirmatory, Contradictory Results? # In[11]: heading_list = metadata_df.heading.value_counts().index.tolist()[::-1]
]])
category_sim_df.head()

# In[10]:

category_sim_df.to_csv("output/category_cossim_95_ci.tsv", sep="\t", index=False)

# In[11]:

g = (p9.ggplot(category_sim_df)
     + p9.aes(x="category", y="pca1_cossim",
              ymin="pca1_cossim_lower", ymax="pca1_cossim_upper")
     + p9.geom_pointrange()
     + p9.coord_flip()
     + p9.theme_bw()
     + p9.scale_x_discrete(limits=category_sim_df.category.tolist()[::-1])
     + p9.theme(figure_size=(11, 7),
                text=p9.element_text(size=12),
                panel_grid_major_y=p9.element_blank())
     + p9.labs(y="PC1 Cosine Similarity"))
g.save("output/pca_plots/figures/category_pca1_95_ci.svg", dpi=500)
g.save("output/pca_plots/figures/category_pca1_95_ci.png", dpi=500)
print(g)

# In[12]:

g = (p9.ggplot(category_sim_df)
     + p9.aes(x="category", y="pca2_cossim",
              ymax="pca2_cossim_upper", ymin="pca2_cossim_lower") +
["is_same_paper_1", "is_same_paper_2", "is_same_paper_3"]].mode(axis=1)))) final_annotated_df.head() # In[6]: binned_stats_df = (final_annotated_df.groupby( "distance_bin").final_same_paper.mean().to_frame().rename( index=str, columns={ "final_same_paper": "frac_correct" }).reset_index()) binned_stats_df # In[7]: g = (p9.ggplot(binned_stats_df, p9.aes(x="distance_bin", y="frac_correct")) + p9.geom_col(fill="#a6cee3") + p9.coord_flip() + p9.labs(x="Fraction Correct", y="Euclidean Distance Bins") + p9.theme_seaborn( context="paper", style="ticks", font="Arial", font_scale=1.5)) g.save("output/figures/distance_bin_accuracy.svg") g.save("output/figures/distance_bin_accuracy.png", dpi=250) print(g) # # Logsitic Regression Performance # In[8]: biorxiv_embed_df = (pd.read_csv(Path("../word_vector_experiment/output/") / "word2vec_output/" / "biorxiv_all_articles_300.tsv.xz", sep="\t").set_index("document"))
def test_coord_flip():
    assert p + coord_flip() == 'coord_flip'
print(best_result)
print("Best CV Fold")
print(model.scores_["polka"][:, best_result[0]])
model.scores_["polka"][:, best_result[0]].mean()

model_weights_df = pd.DataFrame.from_dict({
    "weight": model.coef_[0],
    "pc": list(range(1, 51)),
})
model_weights_df["pc"] = pd.Categorical(model_weights_df["pc"])
model_weights_df.head()

g = (p9.ggplot(model_weights_df, p9.aes(x="pc", y="weight"))
     + p9.geom_col(position=p9.position_dodge(width=5), fill="#253494")
     + p9.coord_flip()
     + p9.scale_x_discrete(limits=list(sorted(range(1, 51), reverse=True)))
     + p9.theme_seaborn(context="paper", style="ticks", font_scale=1.1, font="Arial")
     + p9.theme(figure_size=(10, 8))
     + p9.labs(title="Regression Model Weights",
               x="Principal Component",
               y="Model Weight"))
# g.save("output/figures/pca_log_regression_weights.svg")
# g.save("output/figures/pca_log_regression_weights.png", dpi=250)
print(g)

fold_features = model.coefs_paths_["polka"].transpose(1, 0, 2)

model_performance_df = pd.DataFrame.from_dict({
    "feat_num": ((fold_features.astype(bool).sum(axis=1)) > 0).sum(axis=1),
    "C": model.Cs_,
def error_comparison():
    char_frames = {}
    first_frames = {}
    full_frames = {}
    train_times = {}
    use_wiki = {}
    best_accuracies = {}
    for p in glob.glob(f'output/guesser/best/qanta.guesser*/guesser_report_guesstest.pickle', recursive=True):
        with open(p, 'rb') as f:
            report = pickle.load(f)
            name = report['guesser_name']
            params = report['guesser_params']
            train_times[name] = params['training_time']
            use_wiki[name] = params['use_wiki'] if 'use_wiki' in params else False
            char_frames[name] = report['char_df']
            first_frames[name] = report['first_df']
            full_frames[name] = report['full_df']
            best_accuracies[name] = (report['first_accuracy'], report['full_accuracy'])

    first_df = (pd.concat([f for f in first_frames.values()])
                .sort_values('score', ascending=False)
                .groupby(['guesser', 'qanta_id']).first().reset_index())
    first_df['position'] = ' Start'
    full_df = (pd.concat([f for f in full_frames.values()])
               .sort_values('score', ascending=False)
               .groupby(['guesser', 'qanta_id']).first().reset_index())
    full_df['position'] = 'End'
    compare_df = pd.concat([first_df, full_df])
    compare_df = compare_df[compare_df.guesser != 'qanta.guesser.vw.VWGuesser']
    compare_results = {}
    comparisons = ['qanta.guesser.dan.DanGuesser',
                   'qanta.guesser.rnn.RnnGuesser',
                   'qanta.guesser.elasticsearch.ElasticSearchGuesser']
    cr_rows = []
    for (qnum, position), group in compare_df.groupby(['qanta_id', 'position']):
        group = group.set_index('guesser')
        correct_guessers = []
        wrong_guessers = []
        for name in comparisons:
            if group.loc[name].correct == 1:
                correct_guessers.append(name)
            else:
                wrong_guessers.append(name)
        if len(correct_guessers) > 3:
            raise ValueError('this should be unreachable')
        elif len(correct_guessers) == 3:
            cr_rows.append({'qnum': qnum, 'Position': position,
                            'model': 'All', 'Result': 'Correct'})
        elif len(correct_guessers) == 0:
            cr_rows.append({'qnum': qnum, 'Position': position,
                            'model': 'All', 'Result': 'Wrong'})
        elif len(correct_guessers) == 1:
            cr_rows.append({
                'qnum': qnum, 'Position': position,
                'model': to_shortname(correct_guessers[0]),
                'Result': 'Correct'
            })
        else:
            cr_rows.append({
                'qnum': qnum, 'Position': position,
                'model': to_shortname(wrong_guessers[0]),
                'Result': 'Wrong'
            })
    cr_df = pd.DataFrame(cr_rows)
    # samples = cr_df[(cr_df.Position == ' Start') & (cr_df.Result == 'Correct') & (cr_df.model == 'RNN')].qnum.values
    # for qid in samples:
    #     q = lookup[qid]
    #     print(q['first_sentence'])
    #     print(q['page'])
    #     print()
    p = (
        ggplot(cr_df)
        + aes(x='model', fill='Result')
        + facet_grid(['Result', 'Position'])
        # + facet_wrap('Position', labeller='label_both')
        + geom_bar(aes(y='(..count..) / sum(..count..)'), position='dodge')
        + labs(x='Models', y='Fraction with Corresponding Result')
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=.6)
    )
    p.save('output/plots/guesser_error_comparison.pdf')