def plot_compare_accuracy(self, expo=False):
    """Build a bar chart of ``accuracy`` per ``guesser``, faceted by ``position``.

    The data comes from ``self.acc_df``. With ``expo`` set, bars are filled
    by ``Dataset``, dodged side by side, and both axes get explicit labels;
    otherwise the plain bar chart is returned.
    """
    if not expo:
        plain_mapping = aes(x='guesser', y='accuracy')
        return (
            ggplot(self.acc_df)
            + facet_wrap('position')
            + plain_mapping
            + geom_bar(stat='identity')
        )

    dataset_mapping = aes(x='guesser', y='accuracy', fill='Dataset')
    dodged_bars = geom_bar(stat='identity', position='dodge')
    return (
        ggplot(self.acc_df)
        + facet_wrap('position')
        + dataset_mapping
        + dodged_bars
        + xlab('Guessing Model')
        + ylab('Accuracy')
    )
def plot_cumulative_returns(wanted_stocks: Iterable[str], ld: LazyDictionary) -> p9.ggplot:
    """Plot summed positive/negative returns per date plus the large movers.

    Parameters
    ----------
    wanted_stocks
        Row labels (stock codes) to keep from ``ld["cip_df"]``.
    ld
        Lazy dictionary providing the ``"cip_df"`` frame; only columns whose
        name starts with a digit are used.

    Returns
    -------
    The result of ``user_theme`` applied to the assembled plot: green bars
    for ``positive_sum`` per date, red bars for ``negative_sum`` per date and
    one point per stock/date whose value lies outside [-5.0, 5.0].
    """
    df = ld["cip_df"]
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    df = df.filter(wanted_stocks, axis=0).filter(regex=r"^\d", axis=1)
    # A list (not a set) keeps the melted rows in a deterministic order.
    dates = list(df.columns)

    # assign() returns a new frame, so the helper column does not leak into
    # ``df``, which is aggregated below.
    movers = df.assign(asx_code=df.index)
    movers = movers.melt(id_vars="asx_code", value_vars=dates)
    # ignore small movers; copy() so the column assignment below does not
    # write into a slice of the melted frame
    movers = movers[(movers["value"] < -5.0) | (movers["value"] > 5.0)].copy()
    # NOTE(review): melt() names the variable column after df.columns.name,
    # so this presumably relies on that name being "fetch_date" - confirm.
    movers["fetch_date"] = pd.to_datetime(movers["fetch_date"], format="%Y-%m-%d")

    # need to have separate dataframes for positive and negative stocks -
    # otherwise the plotnine plot will be wrong
    pos_df = df.agg([positive_sum]).melt(value_vars=dates)
    neg_df = df.agg([negative_sum]).melt(value_vars=dates)
    pos_df["fetch_date"] = pd.to_datetime(pos_df["fetch_date"], format="%Y-%m-%d")
    neg_df["fetch_date"] = pd.to_datetime(neg_df["fetch_date"], format="%Y-%m-%d")

    plot = (
        p9.ggplot()
        + p9.geom_bar(
            p9.aes(x="fetch_date", y="value"),
            data=pos_df,
            stat="identity",
            fill="green",
        )
        + p9.geom_bar(
            p9.aes(x="fetch_date", y="value"),
            data=neg_df,
            stat="identity",
            fill="red",
        )
        + p9.geom_point(
            p9.aes(
                x="fetch_date",
                y="value",
                fill="asx_code",
            ),
            data=movers,
            size=3,
            position=p9.position_dodge(width=0.4),
            colour="black",
        )
    )
    return user_theme(
        plot,
        y_axis_label="Cumulative Return (%)",
        legend_position="right",
        asxtrade_want_cmap_d=False,
        # points (stocks) are filled with the user-chosen theme, but
        # everything else is fixed
        asxtrade_want_fill_d=True,
    )
def test_after_scale_mapping():
    """Compare an ``after_scale('ymax + 2')`` plot and a padded-data plot.

    Both plots are checked against the same 'after_scale_mapping' baseline.
    """
    base_values = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
    # Same as above, but add 2 of each unique element
    padded_values = base_values + [1, 2, 3, 4] * 2
    raw = pd.DataFrame({'x': base_values})
    padded = pd.DataFrame({'x': padded_values})

    scaled_plot = ggplot(raw) + geom_bar(
        aes(x='x', ymax=after_scale('ymax + 2')))
    padded_plot = ggplot(padded) + geom_bar(aes(x='x'))

    assert scaled_plot + lims(y=(0, 7)) == 'after_scale_mapping'
    assert padded_plot + lims(y=(0, 7)) == 'after_scale_mapping'
def plot_compare_accuracy(self, expo=False):
    """Return a bar chart of ``self.acc_df`` accuracy by guesser.

    The chart is always faceted on ``position``. When ``expo`` is true the
    bars are additionally filled by ``Dataset``, dodged, and the axes are
    labelled 'Guessing Model' / 'Accuracy'.
    """
    chart = ggplot(self.acc_df) + facet_wrap('position')
    if expo:
        chart = chart + aes(x='guesser', y='accuracy', fill='Dataset')
        chart = chart + geom_bar(stat='identity', position='dodge')
        chart = chart + xlab('Guessing Model')
        return chart + ylab('Accuracy')

    chart = chart + aes(x='guesser', y='accuracy')
    return chart + geom_bar(stat='identity')
def test_calculated_aes():
    """Check that all three calculated-aesthetic spellings are recognised."""
    expected = ['density', 'density*2', 'density + count', 'func(density)']

    def _check(*mappings):
        # Same assertion order as before: keys, then after_stat, then values
        for mapping in mappings:
            assert list(mapping._calculated.keys()) == ['y']
        for mapping, expression in zip(mappings, expected):
            assert mapping['y'].after_stat == expression
        for mapping, expression in zip(mappings, expected):
            assert mapping._calculated['y'] == expression

    # after_stat('ae')
    _check(
        aes('x', y=after_stat('density')),
        aes('x', y=after_stat('density*2')),
        aes('x', y=after_stat('density + count')),
        aes('x', y=after_stat('func(density)')),
    )

    # 'stat(ae)', DEPRECATED but still works
    _check(
        aes('x', y='stat(density)'),
        aes('x', y='stat(density*2)'),
        aes('x', y='stat(density + count)'),
        aes('x', y='stat(func(density))'),
    )

    # '..ae..', DEPRECATED but still works
    _check(
        aes('x', y='..density..'),
        aes('x', y='..density..*2'),
        aes('x', y='..density.. + ..count..'),
        aes('x', y='func(..density..)'),
    )

    df = pd.DataFrame({'x': [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]})
    fill_specs = (
        after_stat('count + 2'),
        'stat(count + 2)',
        '..count.. + 2',
    )
    for fill_spec in fill_specs:
        p = ggplot(df) + geom_bar(aes(x='x', fill=fill_spec))
        p.draw_test()
def bsuite_bar_plot(df_in: pd.DataFrame,
                    sweep_vars: Sequence[str] = None) -> gg.ggplot:
    """Output bar plot of bsuite data.

    Scores are drawn per ``env``, coloured and filled by ``type``, with a
    dashed reference line at 1. When ``sweep_vars`` is given the plot gets
    one facet row per distinct combination, and the figure height grows
    with the number of combinations.
    """
    df = _clean_bar_plot_data(df_in, sweep_vars)

    bars = gg.geom_bar(position='dodge', stat='identity')
    reference_line = gg.geom_hline(yintercept=1., linetype='dashed', alpha=0.5)
    p = (
        gg.ggplot(df)
        + gg.aes(x='env', y='score', colour='type', fill='type')
        + bars
        + reference_line
        + gg.scale_colour_manual(plotting.CATEGORICAL_COLOURS)
        + gg.scale_fill_manual(plotting.CATEGORICAL_COLOURS)
        + gg.xlab('experiment')
        + gg.theme(axis_text_x=gg.element_text(angle=25, hjust=1))
    )

    every_job_finished = all(df.finished)
    if not every_job_finished:
        # add a layer of alpha for unfinished jobs
        p += gg.aes(alpha='finished')
        p += gg.scale_alpha_discrete(range=[0.3, 1.0])

    # Compute the necessary size of the plot
    n_hypers = 1
    if sweep_vars:
        p += gg.facet_wrap(sweep_vars, labeller='label_both', ncol=1)
        n_hypers = df[sweep_vars].drop_duplicates().shape[0]

    figure_height = 3 * n_hypers + 1
    return p + gg.theme(figure_size=(14, figure_height))
def plot_mem(df):
    """Bar chart of ``pcnt`` per ``col_name`` with ``size`` text labels.

    Bars taller than 30% of the maximum get a white label anchored just
    below the bar top; the remaining bars get a gray label just above it.
    The input frame is copied, not modified.
    """
    frame = df.copy()

    # initialise some extra columns useful for plotting
    frame['new_cols'] = [str(name) for name in frame['col_name']]
    frame['new_cols'] = pd.Categorical(
        frame['new_cols'], categories=frame['new_cols'], ordered=True)
    frame['cnt_print_loc_pos'] = frame.pcnt.values + np.max(frame.pcnt.values) / 70
    frame['cnt_print_loc_neg'] = frame.pcnt.values - np.max(frame.pcnt.values) / 70

    def _size_labels(y_column, subset, text_color, anchor):
        # Rotated text layer carrying its own data and aesthetics
        return p9.geom_text(
            p9.aes(x='new_cols', y=y_column, label='size', fill='col_name'),
            inherit_aes=False,
            data=subset,
            color=text_color,
            angle=90,
            vjust=anchor,
        )

    # build basic plot
    ggplt = (
        p9.ggplot(frame, p9.aes(x='new_cols', y='pcnt', fill='new_cols'))
        + p9.geom_bar(stat='identity')
        + p9.guides(fill=False)
        + p9.ylab('% of total size')
        + p9.xlab('')
        + p9.theme(axis_text_x=p9.element_text(rotation=45, hjust=1))
    )

    # add text labels to the highest bars
    tall_bars = frame.copy()[frame.pcnt > 0.3 * np.max(frame.pcnt)]
    ggplt = ggplt + _size_labels('cnt_print_loc_neg', tall_bars, 'white', 'top')

    # add text labels to the lower bars
    short_bars = frame.copy()[frame.pcnt <= 0.3 * np.max(frame.pcnt)]
    ggplt = ggplt + _size_labels('cnt_print_loc_pos', short_bars, 'gray', 'bottom')

    return ggplt
def plot_data_point(self, data_point_ix, use_base=True, figure_size=(8, 6)):
    """
    Plot Shapley values for an individual data point

    Parameters
    ----------
    data_point_ix : int
        Index label of the row to plot.
    use_base : boolean, optional
        default=True. When False the "BASE" column is dropped first.
    figure_size : tuple, optional
        default=(8, 6). Passed to ``theme(figure_size=...)``.

    Returns
    -------
    g : ggplot object

    Raises
    ------
    Exception
        If ``self._shapley_values`` is None.
    """
    # Check Shapley values exist
    if self._shapley_values is None:
        raise Exception("No Shapley values are available")

    selected = self.get_shapley_values().loc[[data_point_ix]]
    if not use_base:
        selected = selected.drop("BASE", axis=1)

    # One row per feature: columns "index", "variable", "value"
    long_form = selected.reset_index(drop=False).melt(id_vars="index")
    plot_title = f"Shapley values (Index: {data_point_ix!s})"

    g = (
        ggplot(long_form, aes(x="variable", y="value", fill="variable"))
        + geom_bar(stat="identity")
        + labs(title=plot_title, x="Feature", y="Shapley value",
               fill="Feature")
        + coord_flip()
    )
    g += theme(figure_size=figure_size)
    return g
def cell_cycle_phase_barplot(adata, palette='Set2'):
    """Plots the number of cells in each phase of the cell cycle

    See also: cell_cycle_phase_pieplot for the matplotlib pie chart

    Parameters
    -----------
    adata: AnnData
        The AnnData object being used for the analysis. ``adata.obs`` must
        contain a ``cell_cycle_phase`` column (the original documentation
        names `tl.annotate_cell_cycle` as the step that provides it).
    palette: str
        Forwarded to ``scale_fill_brewer(type='qual', palette=...)``.

    Returns
    -----------
    A plotnine barplot with the total counts of cell in each phase of the
    cell cycle.
    """
    phase_order = ['G1 post-mitotic', 'G1 pre-replication', 'S/G2/M']

    # Work on a copy so adata.obs keeps its original dtype
    plt_data = adata.obs.copy()
    plt_data['cell_cycle_phase'] = pd.Categorical(
        plt_data['cell_cycle_phase'], categories=phase_order)

    grid_theme = theme(
        panel_grid_major_y=element_blank(),
        panel_grid_minor_y=element_blank(),
        panel_grid_major_x=element_line(size=1.5),
        panel_grid_minor_x=element_line(size=1.5),
    )

    return (
        ggplot(plt_data, aes('cell_cycle_phase', fill='cell_cycle_phase'))
        + geom_bar()
        + coord_flip()
        + guides(fill=False)
        + labs(y='', x='Cell cycle phase')
        + theme_light()
        + grid_theme
        + scale_fill_brewer(type='qual', palette=palette)
    )
def plot_breakdown(cip_df: pd.DataFrame):
    """Stacked bar plot of increasing and decreasing stocks per sector in the specified df

    Columns whose name starts with ``bin_`` are ignored. Each stock's
    remaining values are summed; a sum >= 0 counts as 'up', otherwise
    'down'. Returns None when no stock in ``cip_df`` matches a sector,
    else the result of ``plot_as_inline_html_data``.
    """
    bin_columns = [name for name in cip_df.columns if name.startswith('bin_')]
    row_totals = cip_df.drop(columns=bin_columns).sum(axis='columns')
    df = pd.DataFrame(row_totals, columns=['sum'])
    df = df.merge(stocks_by_sector(), left_index=True, right_on='asx_code')

    if not len(df):  # no stock in cip_df have a sector? ie. ETF?
        return None
    assert set(df.columns) == {'sum', 'asx_code', 'sector_name'}

    df['increasing'] = [
        'up' if total >= 0.0 else 'down' for total in df['sum']
    ]

    # bar order follows value_counts() of the sector names
    sector_names = df['sector_name'].value_counts().index.tolist()
    df = df.assign(
        sector_name_cat=pd.Categorical(df['sector_name'],
                                       categories=sector_names))

    layout = p9.theme(
        axis_text_y=p9.element_text(size=7),
        subplots_adjust={"left": 0.2, 'right': 0.85},
        legend_title=p9.element_blank(),
    )
    plot = (
        p9.ggplot(df, p9.aes(x='factor(sector_name_cat)',
                             fill='factor(increasing)'))
        + p9.geom_bar()
        + p9.labs(x="Sector", y="Number of stocks")
        + layout
        + p9.coord_flip()
    )
    return plot_as_inline_html_data(plot)
def test_stat_count_float():
    """Weighted bar counts with float weights, labelled via after_stat('count')."""
    weighted = pd.DataFrame({'x': ['a', 'b'], 'weight': [1.5, 2.5]})
    count_labels = geom_text(aes(label=after_stat('count')), stat='count')

    p = ggplot(weighted) + aes(x='x', weight='weight', fill='x')
    p = p + geom_bar() + count_labels

    themed = p + _theme
    assert themed == 'stat-count-float'
def plot_vs_discrete(data_table,
                     discrete_metric_name,
                     metric_name,
                     segment_name,
                     title,
                     ylim=None,
                     aggregate="mean"
                     ):
    """Dodged bar chart of an aggregated metric per discrete value and segment.

    Parameters
    ----------
    data_table : DataFrame holding the three named columns.
    discrete_metric_name : column used for the x axis and as a group key.
    metric_name : numeric column that is aggregated; cast to float first.
    segment_name : column used for the fill colour and as a group key.
    title : plot title.
    ylim : optional y-axis limits passed to ``plot.ylim``. ``None`` or a
        missing scalar (NaN) means "no limits"; any sequence is applied.
    aggregate : aggregation passed to ``DataFrame.agg`` (default "mean").

    Returns
    -------
    The assembled plot. Rows where either metric column is null are
    dropped and the aggregated values are rounded to 3 decimals.
    """
    has_both = (pd.notnull(data_table[metric_name])
                & pd.notnull(data_table[discrete_metric_name]))
    wanted_columns = [discrete_metric_name, metric_name, segment_name]
    # copy(): the cast below must not write into a slice of the caller's table
    data_filtered = data_table.loc[has_both, wanted_columns].copy()
    data_filtered[metric_name] = data_filtered[metric_name].astype(float)

    result = (data_filtered
              .groupby([discrete_metric_name, segment_name])
              .agg({metric_name: aggregate})
              .reset_index())
    result[metric_name] = result[metric_name].round(3)

    gg_result = (
        plot.ggplot(result)
        + plot.aes(x=discrete_metric_name, y=metric_name,
                   fill=segment_name, label=metric_name)
        + plot.geom_bar(stat="identity", position="dodge")
        + plot.geom_text(position=plot.position_dodge(width=.9), size=8)
        + plot.labs(x=discrete_metric_name,
                    y=aggregate + "(" + metric_name + ")",
                    title=title)
    )

    # pd.notnull() returns an array for lists/arrays, and an array has no
    # single truth value; treat any sequence as "limits supplied", which is
    # how a tuple already behaved.
    ylim_present = pd.notnull(ylim)
    if getattr(ylim_present, "ndim", 0) > 0 or ylim_present:
        gg_result = gg_result + plot.ylim(ylim)

    return gg_result
def test_stat_count_int():
    """Weighted bar counts with integer weights, labelled via 'stat(count)'."""
    weighted = pd.DataFrame({'x': ['a', 'b'], 'weight': [1, 2]})
    count_labels = geom_text(aes(label='stat(count)'), stat='count')

    p = ggplot(weighted) + aes(x='x', weight='weight', fill='x')
    p = p + geom_bar() + count_labels

    themed = p + _theme
    assert themed == 'stat-count-int'
def plot_downstream(clwe, table, output, ylim):
    """Save a dodged bar chart of accuracy per language for one ``clwe``.

    Reads ``data_file(table)`` as CSV, keeps rows whose ``clwe`` column
    equals ``clwe``, and writes the figure to ``output_file(output)``.
    ``ylim`` is passed to ``coord_cartesian``.
    """
    refine_order = ['Original', '+retrofit', '+synthetic']
    language_order = ['DE', 'ES', 'FR', 'IT', 'JA', 'RU', 'ZH', 'AVG']

    df = pd.read_csv(data_file(table))
    df = df[df.clwe == clwe]
    df = df.assign(
        refine=pd.Categorical(df['refine'], refine_order),
        language=pd.Categorical(df['language'], language_order),
    )

    layout = p9.theme(
        plot_background=p9.element_rect(fill='white'),
        panel_grid_major_y=p9.element_line(),
        axis_text_x=p9.element_text(margin={'t': 10}),
        axis_text_y=p9.element_text(margin={'r': 8}),
        legend_position=(.7, .9),
        legend_direction='horizontal',
        legend_title=p9.element_blank(),
        legend_text=p9.element_text(size=FONT_SIZE),
        legend_box_margin=0,
        figure_size=(12, 3),
    )

    g = (
        p9.ggplot(df, p9.aes(x='language', y='accuracy', fill='refine'))
        + p9.geom_bar(position='dodge', stat='identity', width=.8)
        + p9.coord_cartesian(ylim=ylim)
        + p9.scale_fill_manual(['#999999', '#EA5F94', '#FFB14E'])
        + p9.theme_void(base_size=FONT_SIZE, base_family='Arial')
        + layout
    )
    g.save(filename=output_file(output))
def plot_trend(sample_period="M", ld: LazyDictionary = None) -> str:
    """
    Plot the highest ``last_price`` per sample period for a single stock.

    ``ld["stock_df"]`` supplies the prices; its index is parsed as
    ``%Y-%m-%d`` dates and resampled with ``sample_period`` (monthly by
    default), keeping the maximum of each period. The themed plot from
    ``user_theme`` is returned.
    """
    assert "stock_df" in ld

    def inner_date_fmt(dates_to_format):
        # breaks are set to the end of the month rather than the start, so
        # shift back four weeks before formatting
        four_weeks = timedelta(weeks=4)
        return [(d - four_weeks).strftime("%Y-%m") for d in dates_to_format]

    # NOTE(review): the local must stay named ``dataframe`` - the x aesthetic
    # below is the string "dataframe.index", which presumably resolves
    # against this name; confirm before renaming.
    dataframe = ld["stock_df"].filter(items=["last_price"])
    dataframe.index = pd.to_datetime(dataframe.index, format="%Y-%m-%d")
    dataframe = dataframe.resample(sample_period).max()

    price_column = dataframe.columns[0]
    mapping = p9.aes(x="dataframe.index", y=price_column, fill=price_column)
    plot = (
        p9.ggplot(dataframe, mapping)
        + p9.geom_bar(stat="identity", alpha=0.7)
        # dont print day (always 1st day of month due to resampling)
        + p9.scale_x_datetime(labels=inner_date_fmt)
    )
    return user_theme(plot, y_axis_label="$ AUD", asxtrade_want_fill_continuous=True)
def test_coord_trans_reverse():
    """coord_trans can reverse continuous and discrete data."""
    bars = geom_bar(aes(fill='factor(z)'), show_legend=False)
    reversed_axes = coord_trans(x='reverse', y='reverse')

    p = ggplot(df, aes('factor(x)')) + bars + reversed_axes
    assert p == 'coord_trans_reverse'
def summary(tags, opts=None):
    """Print tag counts per (tag, background) and save them as a bar chart.

    The counts are written to ``tag_species_bg.png`` as a dodged bar chart
    filled by ``background``. ``opts`` is accepted but not used here.
    """
    print(tags)

    counts = tags.groupby(["tag", "background"]).agg({"tag": "count"})
    counts = counts.rename(columns={"tag": "n_tags"}).reset_index()
    tags_summary = counts.astype({"background": "category", "tag": "category"})
    print(tags_summary)

    rotated_ticks = theme(
        axis_text_x=element_text(angle=90, vjust=1, hjust=1, margin={"r": -30})
    )
    chart = (
        ggplot(
            data=tags_summary,
            mapping=aes(x="tag", y="n_tags", fill="background"),
        )
        + geom_bar(stat="identity", show_legend=True, position=position_dodge())
        + xlab("Species")
        + ylab("Number of annotations")
        + geom_text(mapping=aes(label="n_tags"), nudge_y=15)
        + theme_classic()
        + rotated_ticks
    )

    # NOTE(review): what gets printed is the value returned by save(), not
    # the chart itself - kept as-is; confirm whether that is intended.
    plt = chart.save("tag_species_bg.png", width=10, height=8)
    print(plt)
def plot_point_scores(stock: str, sector_companies, all_stocks_cip: pd.DataFrame, rules):
    """
    Visualise the stock in terms of point scores as described on the stock view page.

    Every callable in ``rules`` is invoked once per trading day with a
    ``state`` dict and its return value is added to the running total.
    All rows in ``all_stocks_cip`` are used for the market average on a
    given day, whilst only ``sector_companies`` are used for the sector
    average.

    Parameters
    ----------
    stock : stock code, at least 3 characters; must be a row label of
        ``all_stocks_cip``.
    sector_companies : row labels used for the sector average.
    all_stocks_cip : change-in-percent frame, one column per date.
    rules : non-empty sequence of callables taking the state dict.

    Returns
    -------
    tuple of (the ``plot_series`` result for the cumulative points, the
    ``plot_as_inline_html_data`` result for the net contribution per rule).

    Raises
    ------
    AssertionError if the arguments fail the basic sanity checks.
    """
    assert len(stock) >= 3
    assert all_stocks_cip is not None
    assert rules is not None and len(rules) > 0

    rows = []
    points = 0
    day_low_high_df = day_low_high(stock, all_dates=all_stocks_cip.columns)
    state = {
        "day_low_high_df": day_low_high_df,  # never changes each day, so we init it here
        "all_stocks_change_in_percent_df": all_stocks_cip,
        "stock": stock,
        "daily_range_threshold": 0.20,  # 20% at either end of the daily range gets a point
    }
    net_points_by_rule = defaultdict(int)
    for date in all_stocks_cip.columns:
        day_values = all_stocks_cip[date]
        state.update(
            {
                "market_avg": day_values.mean(),
                "sector_avg": day_values.filter(items=sector_companies).mean(),
                "stock_move": all_stocks_cip.at[stock, date],
                "date": date,
            }
        )
        # Evaluate each rule exactly once per day and reuse the score for
        # both the running total and the per-rule tally.
        for rule in rules:
            score = rule(state)
            points += score
            rule_name = rule.__name__
            if rule_name.startswith("rule_"):
                rule_name = rule_name[5:]
            net_points_by_rule[rule_name] += score
        rows.append({"points": points, "stock": stock, "date": date})

    df = pd.DataFrame.from_records(rows)
    df["date"] = pd.to_datetime(df["date"])
    point_score_plot = plot_series(df, x="date", y="points")

    contributions = [
        {"rule": str(name), "net_points": total}
        for name, total in net_points_by_rule.items()
    ]
    df = pd.DataFrame.from_records(contributions)
    net_rule_contributors_plot = (
        p9.ggplot(df, p9.aes(x="rule", y="net_points"))
        + p9.labs(x="Rule", y="Contribution to points by rule")
        + p9.geom_bar(stat="identity")
        + p9.theme(axis_text_y=p9.element_text(size=7), subplots_adjust={"left": 0.2})
        + p9.coord_flip()
    )
    return point_score_plot, plot_as_inline_html_data(net_rule_contributors_plot)
def plot_compare_accuracy(self, expo=False):
    """Assemble the guesser accuracy bar chart from ``self.acc_df``.

    Both variants facet on ``position``. The ``expo`` variant also fills
    by ``Dataset``, dodges the bars and labels the axes.
    """
    if expo:
        layers = [
            facet_wrap("position"),
            aes(x="guesser", y="accuracy", fill="Dataset"),
            geom_bar(stat="identity", position="dodge"),
            xlab("Guessing Model"),
            ylab("Accuracy"),
        ]
    else:
        layers = [
            facet_wrap("position"),
            aes(x="guesser", y="accuracy"),
            geom_bar(stat="identity"),
        ]

    figure = ggplot(self.acc_df)
    for layer in layers:
        figure = figure + layer
    return figure
def duration_graph(Data, Data_m):
    """Save count bar charts of ``Duration`` for two frames.

    Rows with ``Duration`` < 180 from ``Data`` go to 'Graph_ALL_2.jpeg' and
    those from ``Data_m`` to 'Graph_2.jpeg', both under "pdf/iteration/".
    Nothing is drawn when every ``Duration`` in ``Data`` is missing; a frame
    with no qualifying rows only prints a notice. Always returns None.
    """
    print('======= Creating duration_graph =======')

    def _save_barplot(frame, filename):
        # Identical styling for both charts; only data and file name differ
        plot = (
            p9.ggplot(data=frame, mapping=p9.aes(x='Duration'))
            + p9.geom_bar(fill='red', stat='count', size=100)
            + p9.theme_classic()
            + p9.theme(axis_text=p9.element_text(size=40),
                       axis_title=p9.element_text(size=40, face='bold'))
            + p9.labs(title='', x='', y='No. of attacks')
        )
        plot.save(filename=filename, plot=plot, path="pdf/iteration/",
                  width=25, height=5, dpi=320)

    # Abort when there is not a single usable duration
    missing = Data.Duration[pd.isna(Data.Duration)]
    if len(missing) == len(Data):
        logging.warning('=================================Graph_2 aborted =============================')
        return

    Graph2 = Data_m[Data_m.Duration < 180]
    Graph2_ALL = Data[Data.Duration < 180]

    targets = ((Graph2_ALL, 'Graph_ALL_2.jpeg'), (Graph2, 'Graph_2.jpeg'))
    for frame, filename in targets:
        if len(frame) > 0:
            _save_barplot(frame, filename)
        else:
            print('Plot not created; no data found.')

    return print('=================================duration_graph DONE =============================')
def test_discrete_xy_scale_drop_limits():
    """Bars for x='d' lie outside the discrete limits 'a'..'c'."""
    x_values = list('aaaabbbbccccddd')
    c_values = list('112312231233123')
    df = pd.DataFrame({'x': x_values, 'c': c_values})

    bars = geom_bar(aes(x='x', fill='c'))
    restricted_x = scale_x_discrete(limits=list('abc'))

    p = ggplot(df) + bars + restricted_x
    assert p == 'discrete_xy_scale_drop_limits'
def make_bar_plot(data, x, y):
    """
    Make a bar plot between two variables data[x] and data[y]

    Args:
        data: frame holding both columns.
        x (str): column mapped to the x axis.
        y (str): column mapped to the bar height.

    Returns:
        The plotnine plot, with x tick labels rotated 90 degrees and the
        title '<x> By <y>'.
    """
    # The original expression had an unbalanced ")" and discarded its
    # result; build the plot in one balanced expression and return it.
    return (
        p9.ggplot(data=data, mapping=p9.aes(x=x, y=y))
        + p9.geom_bar(stat='identity')
        + p9.theme(axis_text_x=p9.element_text(angle=90))
        + p9.labs(title='{} By {}'.format(x, y))
    )
def make_single_bar_chart_multi_year(survey_data, column, facet, proportionally=False):
    """Make a barchart showing the number of respondents responding to a single column.

    Bars are colored by which year of the survey they correspond to.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Column to plot responses to
        facet (str or list): Grouping column name, or a list of them. The
            first entry is used for the x axis; all entries (plus "year")
            are used as group keys.
        proportionally (bool, optional): Defaults to False. If True, the bar
            heights are the grouped sum of rows with ``rating == 1`` divided
            by the grouped sum of all rows, instead of a count.

    Returns:
        (plotnine.ggplot): Plot object which can be displayed in a notebook
        or saved out to a file

    Raises:
        ValueError: if ``facet`` is empty, or the long-form data has no
            ``level_1`` column.
    """
    # ``facet`` was used both as a list (``facet + ["year"]``) and as a single
    # column name (``aes(x=facet)``); normalise once so either form works.
    facet_cols = [facet] if isinstance(facet, str) else list(facet)
    if not facet_cols:
        raise ValueError("facet must name at least one column")
    x_column = facet_cols[0]
    group_keys = facet_cols + ["year"]

    topic_data = survey_data[[column, *facet_cols, "year"]]
    # NOTE(review): make_long receives ``facet`` exactly as the caller gave
    # it; confirm which form that helper expects.
    topic_data_long = make_long(topic_data, facet, multi_year=True)
    if "level_1" not in topic_data_long.columns:
        raise ValueError("long-form survey data has no 'level_1' column")

    answered = topic_data_long[topic_data_long.rating == 1].groupby(group_keys)
    if proportionally:
        proportions = (
            answered.sum() / topic_data_long.groupby(group_keys).sum()
        ).reset_index()
    else:
        proportions = answered.count().reset_index()

    ## Uncomment to return dataframe instead of plot
    # return proportions

    facet_levels = topic_data_long[x_column].unique().tolist()
    return (
        p9.ggplot(proportions, p9.aes(x=x_column, fill="year", y="level_1"))
        + p9.geom_bar(show_legend=False, stat="identity")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=facet_levels,
            labels=[level.replace("_", " ") for level in facet_levels],
        )
    )
def plot_grades(dat):
    """Return a bar chart of ``grade`` counts with one facet per ``keywords``.

    Args:
        dat: frame with ``grade`` and ``keywords`` columns.

    Returns:
        The plotnine plot object.
    """
    import plotnine as p9

    # Braces build a Python set literal (they are not an R-style block), so
    # the original returned a one-element set instead of the plot itself.
    p = (
        p9.ggplot(dat, p9.aes('grade'))
        + p9.geom_bar()
        + p9.facet_wrap('keywords')
    )
    return p
def test_dodge_preserve_single_text():
    """Bars and their count labels share one position_dodge(preserve='single')."""
    df1 = pd.DataFrame({'x': ['a', 'b', 'b', 'b'],
                        'y': ['a', 'a', 'b', 'b']})
    d = position_dodge(preserve='single', width=0.9)

    count_labels = geom_text(
        aes(y=after_stat('count'), label=after_stat('count')),
        stat='count',
        position=d,
        va='bottom',
    )
    p = ggplot(df1, aes('x', fill='y')) + geom_bar(position=d) + count_labels

    themed = p + _theme
    assert themed == 'dodge_preserve_single_text'
def plotfreq(freqdf):
    '''
    Horizontal bar chart of a frequency table.

    ----------

    Parameters
    ----------
    freqdf      dataframe generated by freq(): exactly four columns, the
                last three named 'freq', 'perc' and 'cump', and at most
                100 rows

    Returns
    -------
    Bar chart with frequencies & percentages, bars ordered by frequency.
    When the input fails a check, a message is printed and None is
    returned instead.

    Example
    -------
    import exploretransform as et
    df, X, y = et.loadboston()
    et.plotfreq(et.freq(X['town']))

    Warning
    -------
    This function will likely not plot more than 100 unique levels properly.

    ----------
    '''
    # input checks
    if not isinstance(freqdf, (pd.core.frame.DataFrame)):
        return print("\nFunction only accetps dataframes\n")
    if len(freqdf.columns) != 4:
        return print("\nInput must be a dataframe generated by freq()\n")
    if sum(freqdf.columns[1:4] == ['freq', 'perc', 'cump']) != 3:
        return print("\nInput must be a dataframe generated by freq()\n")
    if len(freqdf) >= 101:
        return print("\nUnable to plot more than 100 items")

    level_column = freqdf.columns[0]

    # label for plot
    freq_text = freqdf['freq'].astype(str)
    perc_text = '[ ' + freqdf['perc'].astype(str) + '%' + ' ]'
    lbl = freq_text.str.cat(perc_text, sep=' ')

    # create variable to be used in aes
    aesx = 'reorder(' + level_column + ', freq)'

    # build plot
    compact_axis = pn.theme(
        axis_text_y=pn.element_text(size=6, weight='bold'),
        legend_position='none')
    plot = (
        pn.ggplot(freqdf)
        + pn.aes(x=aesx, y='freq', fill='freq', label=lbl)
        + pn.geom_bar(stat='identity')
        + pn.coord_flip()
        + compact_axis
        + pn.labs(x=level_column, y="Freq")
        + pn.scale_fill_gradient2(mid='bisque', high='blue')
        + pn.geom_text(size=6, nudge_y=.7)
    )
    return plot
def plot_rank_full(df, plot_fn):
    """Save a bar chart of ``ratio`` per ``emotion_cat``, filled by rank.

    The chart is faceted on ``cluster_labels_6`` and written to ``plot_fn``.
    """
    mapping = p9.aes(x="emotion_cat", y="ratio", fill="factor(rank)")
    legend_layout = p9.theme(
        legend_position="top",
        legend_direction="horizontal",
        figure_size=(10, 5),
    )
    background = p9.element_rect(fill=BG_COLOR, color=BG_COLOR, size=1)

    f = (
        p9.ggplot(df, mapping)
        + p9.geom_bar(stat="identity")
        + p9.facet_wrap("cluster_labels_6")
        + p9.labs(x="Model", y="Proportion (%)", fill="Rank")
        + p9.theme_538()
        + legend_layout
        + p9.theme(
            plot_background=background,
            axis_text_x=p9.element_text(rotation=45, hjust=1),
        )
    )
    f.save(plot_fn)
def create(self, file_path: str) -> None:
    """Save the "Design Pattern Counts" bar chart of ``self._data`` to ``file_path``.

    Bars show ``count`` per ``pattern``; each bar is labelled with its
    ``fraction`` formatted as a percentage.
    """
    fraction_labels = geom_text(va='bottom', size=24, format_string='{:.1%}')
    text_theme = theme(
        text=element_text(size=32),
        axis_text_x=element_text(rotation=45, ha="right"),
    )

    chart = (
        ggplot(self._data, aes(x="pattern", y="count", label="fraction"))
        + geom_bar(stat="identity", fill="#1e4f79")
        + fraction_labels
        + scale_x_discrete(limits=self._data["pattern"])
        + scale_y_continuous(labels=comma_format(), expand=[0.1, 0])
        + ggtitle("Design Pattern Counts")
        + xlab("Design Pattern")
        + ylab("Count")
        + theme_classic(base_size=32, base_family="Helvetica")
        + text_theme
    )
    chart.save(file_path, width=24, height=8)
def create(self, file_path: str) -> None:
    """Save the "Case Study Sizes" bar chart of ``self._data`` to ``file_path``.

    Bars count the rows per ``count`` value and carry that count as a label.
    """
    project_counts = [
        "1", "2", "3", "5", "26", "52", "97", "100", "300", "537"
    ]

    chart = (
        ggplot(self._data, aes(x="count", label="..count.."))
        + geom_bar(fill="#1e4f79")
        + geom_text(stat="count", va='bottom', size=24)
        + scale_x_discrete(limits=project_counts)
        + scale_y_continuous(breaks=[0, 5, 10], limits=[0, 10])
        + ggtitle("Case Study Sizes")
        + xlab("Number of Projects")
        + ylab("Number of Case Studies")
        + theme_classic(base_size=28, base_family="Helvetica")
        + theme(text=element_text(size=28))
    )
    chart.save(file_path, width=14, height=7)
def test_calculated_aes():
    """Both 'stat(ae)' and '..ae..' spellings are detected and stripped."""
    _strip = strip_calculated_markers
    stripped = ['density', 'density*2', 'density + count', 'func(density)']

    spellings = (
        # stat(ae)
        ['stat(density)', 'stat(density*2)',
         'stat(density + count)', 'func(stat(density))'],
        # ..ae..
        ['..density..', '..density..*2',
         '..density.. + ..count..', 'func(..density..)'],
    )
    for expressions in spellings:
        mappings = [aes('x', y=expression) for expression in expressions]
        for mapping in mappings:
            assert get_calculated_aes(mapping) == ['y']
        for mapping, wanted in zip(mappings, stripped):
            assert _strip(mapping['y']) == wanted

    df = pd.DataFrame({'x': [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]})
    for fill_spec in ('stat(count + 2)', '..count.. + 2'):
        p = ggplot(df) + geom_bar(aes(x='x', fill=fill_spec))
        p.draw_test()
def plot_outcome_counts(read_file_1, read_file_2, save_file, plot_dir):
    """Save a dodged bar chart of row counts per (outcome, model).

    Both CSV files are read from ``dir_output``, passed through
    ``recode_outcome`` and concatenated; the figure is written to
    ``dir_figures/plot_dir/save_file``.
    """
    sub_path = os.path.join(dir_output, read_file_1)
    agg_path = os.path.join(dir_output, read_file_2)
    temp_sub = pd.read_csv(sub_path)
    temp_agg = pd.read_csv(agg_path)

    temp_sub = recode_outcome(temp_sub)
    temp_agg = recode_outcome(temp_agg)

    plot_output = os.path.join(dir_figures, plot_dir)

    combined = pd.concat([temp_agg, temp_sub], axis=0).reset_index(drop=True)
    dat = combined.groupby(['outcome', 'model']).size().reset_index(name='counts')

    bars = (ggplot(dat, aes(x='outcome', y='counts', fill='model'))
            + geom_bar(stat='identity', position='dodge'))
    img = bars + labs(x='Outcome', y='Counts') + theme_bw()
    img.save(os.path.join(plot_output, save_file))
def test_removes_infinite_values():
    """Building a plot with two infinite ``wt`` values warns about 2 removed rows."""
    df = mtcars.copy()
    df.loc[[0, 5], 'wt'] = [np.inf, -np.inf]
    p = ggplot(df, aes(x='wt')) + geom_bar()

    with pytest.warns(UserWarning) as record:
        p._build()

    # At least one warning must mention both the row count and the cause
    messages = [str(item.message).lower() for item in record]
    assert any('2 rows' in msg and 'non-finite' in msg for msg in messages)
def test_dodge():
    """Bars filled by factor(x) are dodged within each factor(z)."""
    dodged_bars = geom_bar(aes(fill='factor(x)'), position='dodge')
    p = ggplot(df2, aes('factor(z)')) + dodged_bars

    themed = p + _theme
    assert themed == 'dodge'
# Colour ramp shared by the colour and fill scales
gradient = (
    (0.99, 0.88, 0.87),
    (0.98, 0.62, 0.71),
    (0.86, 0.20, 0.59),
    bcolor,
    bcolor,
    bcolor_darker,
    bcolor_darker,
)

# One slice of df per geom: points, line, bars
df1 = df[:n//3:9]
df2 = df[n//3:2*n//3]
df3 = df[2*n//3::12]

p = ggplot(aes('x', 'y', color='y', fill='y'))
# Layer 0: the project-name label
p = p + annotate(
    geom='label', x=0.295, y=0.495, label='pl tnine',
    label_size=1.5, label_padding=.1, size=24,
    fill=bcolor_lighter, color=bcolor)
p = p + geom_point(df1, size=8, stroke=0, show_legend=False)
p = p + geom_line(df2, size=2, color=bcolor_darker, show_legend=False)
p = p + geom_bar(df3, aes('x+.06'), stat='identity', size=0,
                 show_legend=False)
p = p + scale_color_gradientn(colors=gradient)
p = p + scale_fill_gradientn(colors=gradient)
p = p + theme_void()
p = p + theme(figure_size=(3.6, 3.6))

p.save('logo.pdf', pad_inches=-0.04)

# Remove the project name
p.layers = p.layers.__class__(p.layers[1:])
p.save('logo-small.pdf', pad_inches=-0.04)
from __future__ import absolute_import, division, print_function import numpy as np import pandas as pd from mizani.transforms import trans_new from plotnine import (ggplot, aes, geom_bar, coord_flip, coord_fixed, coord_trans) n = 10 # Some even number greater than 2 # ladder: 0 1 times, 1 2 times, 2 3 times, ... df = pd.DataFrame({'x': np.repeat(range(n+1), range(n+1)), 'z': np.repeat(range(n//2), range(3, n*2, 4))}) p = (ggplot(df, aes('x')) + geom_bar(aes(fill='factor(z)'), show_legend=False)) def test_coord_flip(): assert p + coord_flip() == 'coord_flip' def test_coord_fixed(): assert p + coord_fixed(0.5) == 'coord_fixed' def test_coord_trans(): double_trans = trans_new('double', np.square, np.sqrt) assert p + coord_trans(y=double_trans) == 'coord_trans'
def test_calculated_expressions():
    """An expression on a calculated aesthetic builds without raising."""
    mapping = aes(x='factor(cyl)', y='..count..+1')
    p = ggplot(mtcars, mapping) + geom_bar()

    # No exception
    p._build()
def test_dodge_preserve_single():
    """Dodged bars drawn with position_dodge(preserve='single')."""
    df1 = pd.DataFrame({'x': ['a', 'b', 'b'],
                        'y': ['a', 'a', 'b']})
    single_width_dodge = position_dodge(preserve='single')

    p = ggplot(df1, aes('x', fill='y')) + geom_bar(position=single_width_dodge)

    themed = p + _theme
    assert themed == 'dodge_preserve_single'
def error_comparison():
    """Plot how three guessers agree or disagree on each question.

    Every pickled guesser report under ``output/guesser/best`` contributes
    its ``first_df`` and ``full_df`` frames. Per guesser and question the
    highest-scoring row is kept. Each (question, position) pair is then
    classified over the DAN, RNN and ElasticSearch guessers:

    * all correct  -> model 'All', result 'Correct'
    * none correct -> model 'All', result 'Wrong'
    * one correct  -> that model, result 'Correct'
    * otherwise    -> the first wrong model, result 'Wrong'

    The figure is saved to ``output/plots/guesser_error_comparison.pdf``.

    Raises:
        ValueError: if no report file matches the glob pattern.
    """
    first_frames = {}
    full_frames = {}
    report_glob = 'output/guesser/best/qanta.guesser*/guesser_report_guesstest.pickle'
    for path in glob.glob(report_glob, recursive=True):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # report files produced by a trusted run.
        with open(path, 'rb') as f:
            report = pickle.load(f)
        name = report['guesser_name']
        first_frames[name] = report['first_df']
        full_frames[name] = report['full_df']

    if not first_frames:
        raise ValueError(f'no guesser reports matched {report_glob}')

    def _top_scoring_rows(frames, position):
        # Keep the highest-scoring row per (guesser, question)
        top = (
            pd.concat(list(frames.values()))
            .sort_values('score', ascending=False)
            .groupby(['guesser', 'qanta_id'])
            .first()
            .reset_index()
        )
        top['position'] = position
        return top

    # The position labels are kept exactly as before, including the leading
    # space in ' Start'.
    first_df = _top_scoring_rows(first_frames, ' Start')
    full_df = _top_scoring_rows(full_frames, 'End')

    compare_df = pd.concat([first_df, full_df])
    compare_df = compare_df[compare_df.guesser != 'qanta.guesser.vw.VWGuesser']

    comparisons = ['qanta.guesser.dan.DanGuesser',
                   'qanta.guesser.rnn.RnnGuesser',
                   'qanta.guesser.elasticsearch.ElasticSearchGuesser']
    cr_rows = []
    for (qnum, position), group in compare_df.groupby(['qanta_id', 'position']):
        group = group.set_index('guesser')
        correct_guessers = []
        wrong_guessers = []
        for name in comparisons:
            if group.loc[name].correct == 1:
                correct_guessers.append(name)
            else:
                wrong_guessers.append(name)

        n_correct = len(correct_guessers)
        if n_correct == len(comparisons):
            model, result = 'All', 'Correct'
        elif n_correct == 0:
            model, result = 'All', 'Wrong'
        elif n_correct == 1:
            model, result = to_shortname(correct_guessers[0]), 'Correct'
        else:
            model, result = to_shortname(wrong_guessers[0]), 'Wrong'
        cr_rows.append({
            'qnum': qnum, 'Position': position,
            'model': model, 'Result': result,
        })
    cr_df = pd.DataFrame(cr_rows)

    p = (
        ggplot(cr_df)
        + aes(x='model', fill='Result')
        + facet_grid(['Result', 'Position'])
        + geom_bar(aes(y='(..count..) / sum(..count..)'), position='dodge')
        + labs(x='Models', y='Fraction with Corresponding Result')
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=.6)
    )
    p.save('output/plots/guesser_error_comparison.pdf')