import numpy as np
import pandas as pd
import plotnine


def scale_x_datetime_auto(times: pd.Series, figsize=(12, 10)):
    """Automatically choose breaks and label format based on the duration of the series."""
    width = figsize[0]
    dt = (times.iloc[-1] - times.iloc[0]).total_seconds()  # assumes times is sorted
    mins = dt / 60
    hours = dt / 3600
    days = dt / 86400
    if days > 10:
        fmt = "%Y-%m-%d"
        breaks = "%1.0f days" % max(np.round(days / (width / 2.0)), 1.0)
    elif days > 1.0:
        fmt = "%Y-%m-%d %H:%M"
        breaks = "%1.0f hours" % max(np.round(hours / (width / 2.0)), 1.0)
    elif hours > 1:
        fmt = "%H:%M"
        breaks = "%1.0f minutes" % max(np.round(mins / (width / 1.5)), 1.0)
    else:
        fmt = "%H:%M:%S"
        breaks = "%1.0f minutes" % max(np.round(mins / (width / 1.5)), 1.0)
    return plotnine.scale_x_datetime(date_labels=fmt, date_breaks=breaks)
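# Usage sketch (hypothetical data): a three-day hourly series falls into the
# multi-day branch, so the returned scale uses "%Y-%m-%d %H:%M" labels with
# roughly half-day breaks.
demo_times = pd.Series(pd.date_range("2021-01-01", periods=72, freq="h"))
demo = pd.DataFrame({"t": demo_times, "v": np.arange(72)})
demo_plot = (plotnine.ggplot(demo, plotnine.aes("t", "v"))
             + plotnine.geom_line()
             + scale_x_datetime_auto(demo_times))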
def event_counts_date(request_disc=None):
    """Plot the admission timeline of a certain institution.

    `request_disc` is a dictionary of column -> value filters, applied against
    the module-level dataframe `df`.
    """
    if request_disc is None:
        request_disc = {}
    request = np.ones(df.shape[0], dtype=bool)
    for key, value in request_disc.items():
        if key == "institution":
            request &= df[key].str.contains(value)
        else:
            request &= df[key] == value
    df_selected = df[request].copy()
    # Project all dates onto a single dummy year so different years overlay.
    df_selected["date_md"] = df_selected["admission_date"].apply(
        lambda dt: dt.replace(year=1980))
    df_selected["year"] = df_selected["admission_date"].apply(lambda dt: dt.year)
    samp = df_selected.iloc[0]
    title = " ".join(str(samp[key]) for key in request_disc)
    gg = p9.ggplot(df_selected)
    gg += p9.aes(x="date_md", y="admission_status")
    gg += p9.scale_x_datetime(
        date_breaks='10 days',
        date_labels="%m-%d",
        limits=[df_selected["date_md"].min(), pd.to_datetime("1980-04-20")])
    gg += p9.geom_count()
    gg += p9.ggtitle(title)
    return gg
def plot_frequency(n=200):
    """
    Draw a histogram of the distribution of n tweets by date.

    Parameters
    ----------
    n : int
        How many tweets should be analysed.

    Returns
    -------
    None. Saves the histogram as a .png file in the static folder.
    """
    from plotnine import ggplot, aes, geom_histogram, scale_x_datetime, labs, theme_minimal, ggsave
    from mizani.breaks import date_breaks
    import pandas

    from Mod_1_API import gather_tweets

    df = pandas.DataFrame(gather_tweets(n))
    plot1 = (ggplot(df, aes(x='Date', fill='Author'))
             + geom_histogram()
             + scale_x_datetime(breaks=date_breaks('1 week'))
             + labs(x="Time in weeks", y="Number of tweets by source")
             + theme_minimal())
    ggsave(plot=plot1, filename="test.png", path="static/")
def worldbank_plot(
    df: pd.DataFrame,
    title: str,
    dates_are_yearly: bool,
    figure_size=(12, 6),
    add_points=False,
    **plot_kwargs,
) -> p9.ggplot:
    """
    Carefully written to support all worldbank plots, this function is the one
    place where the app needs themes, colour maps and various plot-related
    settings. For sparse datasets it uses geom_point() in addition to
    geom_line(), in case the data is so sparse that lines cannot be drawn.
    Returns a ggplot instance or raises an exception if the dataframe is empty.
    """
    if df is None or len(df) == 0:
        print(f"No usable data/plot for {title}")
        raise Http404(f"No data for {title}")  # assumes a Django view context
    pct_na = (df["metric"].isnull().sum() / len(df)) * 100.0
    assert 0.0 <= pct_na <= 100.0
    plot = (p9.ggplot(df, p9.aes("date", "metric", **plot_kwargs))
            + p9.geom_path(size=1.2)
            + p9.scale_y_continuous(labels=label_shorten))
    if dates_are_yearly:
        # Yearly data, so only print the year on the x-axis.
        plot += p9.scale_x_datetime(labels=date_format("%Y"))
    # If pct_na is too high, geom_path() may be unable to draw a line (each
    # value is surrounded by NaN, preventing a path), so geom_point() is used
    # to highlight the sparse nature of the data.
    if pct_na >= 30.0 or add_points or df["metric"].count() <= 3:
        plot += p9.geom_point(size=3.0)
    return user_theme(plot, y_axis_label="Value", figure_size=figure_size)
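# Usage sketch (hypothetical data; assumes the app's `user_theme` and
# `label_shorten` helpers plus a Django request context are available).
# Sparse yearly data like this triggers the geom_point() branch:
#
# wb = pd.DataFrame({
#     "date": pd.to_datetime(["2000-01-01", "2005-01-01", "2010-01-01"]),
#     "metric": [1.2e9, None, 2.4e9],
# })
# plot = worldbank_plot(wb, "GDP (hypothetical)", dates_are_yearly=True)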
def plot_trend(sample_period="M", ld: LazyDictionary = None) -> p9.ggplot:
    """
    Given a dataframe of a single stock from company_prices(), plot the
    highest price in each sample period over the time span of the dataframe.
    """
    assert ld is not None and "stock_df" in ld

    def inner_date_fmt(dates_to_format):
        results = []
        for d in dates_to_format:
            # Breaks are set to the end of the month rather than the start,
            # so shift each label back four weeks before formatting.
            d -= timedelta(weeks=4)
            results.append(d.strftime("%Y-%m"))
        return results

    stock_df = ld["stock_df"]
    dataframe = stock_df.filter(items=["last_price"])
    dataframe.index = pd.to_datetime(dataframe.index, format="%Y-%m-%d")
    dataframe = dataframe.resample(sample_period).max()
    # Move the datetime index into a column so plotnine can map it directly.
    dataframe["date"] = dataframe.index
    plot = (
        p9.ggplot(
            dataframe,
            p9.aes(x="date", y=dataframe.columns[0], fill=dataframe.columns[0]),
        )
        + p9.geom_bar(stat="identity", alpha=0.7)
        # Don't print the day (always the 1st of the month due to resampling).
        + p9.scale_x_datetime(labels=inner_date_fmt)
    )
    return user_theme(plot, y_axis_label="$ AUD", asxtrade_want_fill_continuous=True)
import os

from plotnine import (aes, element_text, geom_line, geom_point, geom_ribbon,
                      ggplot, scale_x_datetime, theme, theme_bw, xlab, ylab)


def plot_predict(forecast):
    p = (ggplot(data=forecast, mapping=aes(x='ds', y='y'))
         + geom_point(colour='blue', alpha=0.3, na_rm=True)
         + geom_line(colour='blue', na_rm=True)
         + geom_line(data=forecast, mapping=aes(x='ds', y='yhat'), colour='red')
         + geom_ribbon(data=forecast,
                       mapping=aes(ymin='yhat_lower', ymax='yhat_upper'),
                       fill='blue', alpha=0.1)
         # Interval strings belong to date_breaks; `breaks` expects explicit values.
         + scale_x_datetime(date_breaks='1 days', date_labels='%y-%m-%d %H:%M')
         + xlab('Time') + ylab('Pressure')
         + theme_bw()
         + theme(axis_text_x=element_text(angle=45, hjust=1, face='bold', color='black'),
                 axis_text_y=element_text(face='bold', colour='black')))
    p.save(filename='predict_pressure_chart.png',
           path=os.path.join(os.path.abspath(os.path.dirname(__file__)), 'png'),
           width=8, height=6, units='in', dpi=326, verbose=False)
    return p
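# Usage sketch (hypothetical data): plot_predict expects Prophet-style columns
# (ds, y, yhat, yhat_lower, yhat_upper), and the png/ directory next to this
# module must exist before the call:
#
# import numpy as np
# import pandas as pd
# ds = pd.date_range("2021-01-01", periods=48, freq="h")
# yhat = np.random.normal(100, 1, 48)
# forecast = pd.DataFrame({"ds": ds, "y": yhat + np.random.normal(0, 1, 48),
#                          "yhat": yhat,
#                          "yhat_lower": yhat - 1, "yhat_upper": yhat + 1})
# p = plot_predict(forecast)  # also writes png/predict_pressure_chart.png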
from datetime import timedelta


def duration_TL(Data):
    print('======= Creating duration_TL =======')
    missing = Data.Duration[Data.Duration.isna()]
    # Fires when all but a handful of Duration values are missing.
    if len(missing) + 10 >= len(Data):
        print("WARNING: All values for Duration are NA's")
    else:
        # Filter symptoms and plausible durations
        Symptomes = Data[(Data.Group == "sy") & (Data.Duration < 180)].copy()
        Symptomes['Date'] = pd.to_datetime(Symptomes['Date'])
        if len(Symptomes) == 0:
            print('No duration for TL_2')
        else:
            sdate = min(Symptomes["Date"])  # start date
            edate = max(Symptomes["Date"])  # end date
            delta = edate - sdate  # as timedelta
            # Reindex onto a complete daily range so missing days show up as gaps.
            day = [sdate + timedelta(days=i) for i in range(delta.days + 1)]
            DF = pd.DataFrame({'Date': day})
            data_with_missing_times = pd.merge(DF, Symptomes, on='Date', how='outer')
            data_with_missing_times.Date = pd.to_datetime(data_with_missing_times.Date)
            if delta.days > 1825:
                datebreaks = '18 months'
            elif delta.days > 1095:
                datebreaks = '12 months'
            else:
                datebreaks = '6 months'
            plot = (p9.ggplot(data=data_with_missing_times,
                              mapping=p9.aes(x='Date', y='Duration'))
                    + p9.geom_smooth(color='red', size=5, method="loess", se=False)
                    + p9.theme_classic()
                    + p9.theme(axis_text=p9.element_text(size=33),
                               axis_title=p9.element_text(size=33, face='bold'))
                    + p9.scale_x_datetime(date_labels='%Y-%m', date_breaks=datebreaks)
                    + p9.labs(x='', y=''))
            if len(data_with_missing_times) > 0:
                plot.save(filename='TL_2.jpeg', path="pdf/iteration/",
                          width=25, height=5, dpi=320)
            else:
                print('Plot not created; no data found.')
    print('================================= duration_TL DONE =============================')
from datetime import timedelta


def frequency_TL(Data):
    print('======= Creating frequency_TL =======')
    # Count events per day. The first column ("Unnamed: 0", from the CSV index)
    # carries the per-day counts after the groupby.
    Data['date_4'] = Data['date'].dt.date
    tl4 = Data.groupby("date_4", sort=False, as_index=False).count()
    tl4 = tl4.iloc[:, 0:2]
    tl4 = tl4.rename(columns={"Unnamed: 0": "n"})
    sdate = min(tl4["date_4"])  # start date
    edate = max(tl4["date_4"])  # end date
    delta = edate - sdate  # as timedelta
    # Reindex onto a complete daily range so missing days show up as gaps.
    day = [sdate + timedelta(days=i) for i in range(delta.days + 1)]
    DF = pd.DataFrame({'date_4': day})
    data_with_missing_times = pd.merge(DF, tl4, on='date_4', how='outer')
    if delta.days > 1825:
        datebreaks = '18 months'
    elif delta.days > 1095:
        datebreaks = '12 months'
    else:
        datebreaks = '6 months'
    # Create and save TL_4
    plot = (p9.ggplot(data=data_with_missing_times,
                      mapping=p9.aes(x='date_4', y='n'))
            + p9.geom_col(fill='red')
            + p9.theme_classic()
            + p9.theme(axis_text=p9.element_text(size=40),
                       axis_title=p9.element_text(size=40, face='bold'))
            + p9.scale_x_datetime(date_labels='%Y-%m', date_breaks=datebreaks)
            + p9.labs(x='', y=''))
    if len(data_with_missing_times) > 0:
        plot.save(filename='TL_4.jpeg', path="pdf/iteration/",
                  width=25, height=5, dpi=320)
    else:
        print('Plot not created; no data found.')
    print('================================= frequency_TL DONE =============================')
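# Usage sketch (hypothetical file): `Data` only needs a datetime column named
# "date", plus a leading "Unnamed: 0" column such as pandas produces when
# reading a CSV that was written with its index; the pdf/iteration/ output
# directory must already exist.
#
# Data = pd.read_csv("symptoms.csv")           # hypothetical input
# Data["date"] = pd.to_datetime(Data["date"])
# frequency_TL(Data)                           # writes pdf/iteration/TL_4.jpeg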
def create_plot(self, columns):
    for col in columns:
        if col not in self.data.columns:
            raise ValueError('No column "%s" in the data' % col)
    try:
        from plotnine import (ggplot, theme_bw, aes, geom_line, expand_limits,
                              scale_x_datetime, ylab, facet_wrap, theme)
        from mizani.formatters import date_format
    except ImportError:
        msg = """Package 'plotnine' is required for the plot functionality.
        Try installing it with 'pip install plotnine'."""
        raise RatatouilleDependencyError(msg)
    data = self.data.copy()
    if len(columns) > 0:
        data = data[['timestamp'] + columns]
    elif 'hostname' in data:
        data.drop('hostname', axis=1, inplace=True)
    # Gap between consecutive samples; large gaps split the series into segments.
    data['time_diff'] = data['timestamp'][1:].reset_index(drop=True) \
        - data['timestamp'][:-1].reset_index(drop=True)
    time_step = data['time_diff'].median()
    breakpoints = list(data[data['time_diff'] > time_step * 10].timestamp)
    breakpoints = [data['timestamp'].min(), *breakpoints, data['timestamp'].max()]
    data = data.drop('time_diff', axis=1).melt('timestamp')
    import pandas
    if len(columns) > 0:
        data['variable'] = pandas.Categorical(data['variable'], categories=columns)
    plot = ggplot() + theme_bw()
    # Draw each contiguous segment separately so lines do not bridge the gaps.
    for min_t, max_t in zip(breakpoints[:-1], breakpoints[1:]):
        tmp = data[(data['timestamp'] > min_t) & (data['timestamp'] < max_t)]
        plot += geom_line(tmp, aes(x='timestamp', y='value', color='variable'),
                          show_legend=False)
    plot += facet_wrap(['variable'], scales='free')
    timedelta = self.data.timestamp.max() - self.data.timestamp.min()
    if timedelta.days > 2:
        plot += scale_x_datetime(labels=date_format('%Y/%m/%d'))
    else:
        plot += scale_x_datetime(labels=date_format('%H:%M'))
    plot += ylab('Value')
    return plot
from datetime import timedelta


def intensity_TL(Data):
    print('======= Creating intensity_TL =======')
    missing = Data.Intensity[Data.Intensity.isna()]
    if len(missing) == len(Data):
        print("WARNING: All values for Intensity are NA's")
    else:
        # Filter symptoms and take the mean intensity per day
        Symptomes = Data[Data.Group == "sy"]
        tl3 = Symptomes.groupby("Date", as_index=False, sort=False)['Intensity'].mean()
        tl3['Date'] = pd.to_datetime(tl3['Date'])
        sdate = min(tl3["Date"])  # start date
        edate = max(tl3["Date"])  # end date
        delta = edate - sdate  # as timedelta
        # Reindex onto a complete daily range so missing days show up as gaps.
        day = [sdate + timedelta(days=i) for i in range(delta.days + 1)]
        DF = pd.DataFrame({'Date': day})
        data_with_missing_times = pd.merge(DF, tl3, on='Date', how='outer')
        if delta.days > 1825:
            datebreaks = '18 months'
        elif delta.days > 1095:
            datebreaks = '12 months'
        else:
            datebreaks = '6 months'
        plot = (p9.ggplot(data=data_with_missing_times,
                          mapping=p9.aes(x='Date', y='Intensity'))
                + p9.geom_point(color='red', size=5)
                + p9.theme_classic()
                + p9.theme(axis_text=p9.element_text(size=40),
                           axis_title=p9.element_text(size=40, face='bold'))
                + p9.scale_x_datetime(date_labels='%Y-%m', date_breaks=datebreaks)
                + p9.labs(x='', y=''))
        # Create and save TL_3
        if len(data_with_missing_times) > 5:
            plot.save(filename='TL_3.jpeg', path="pdf/iteration/",
                      width=25, height=5, dpi=320)
        else:
            print('Plot not created; not enough data.')
    print('================================= intensity_TL DONE =============================')
import os

import pandas as pd
from plotnine import (aes, element_text, geom_line, geom_point, ggplot,
                      scale_x_datetime, theme, theme_bw)


def plot_chart(df, category='temperature', time_interval=None):
    if category not in ['temperature', 'pressure']:
        raise ValueError(f"category {category!r} must be 'temperature' or 'pressure'")
    elif category == 'temperature':
        key_words = 'TEMP_Value'
    else:
        key_words = 'PSIG_Value'
    var_list = [variable for variable in df.columns.values if key_words in variable]
    df_var = df[['Timestamp'] + var_list]
    df_var = df_var.dropna(axis=0)
    df_var = df_var.melt(id_vars=['Timestamp'], var_name='ITEM', value_name='Value')
    df_var['Timestamp'] = pd.to_datetime(df_var['Timestamp'])
    if time_interval is None:
        time_interval = [min(df_var['Timestamp']), max(df_var['Timestamp'])]
    p = (ggplot(data=df_var, mapping=aes(x='Timestamp', y='Value'))
         + geom_point(alpha=0.2, mapping=aes(colour='factor(ITEM)'), na_rm=True)
         + geom_line(mapping=aes(colour='factor(ITEM)'), na_rm=True)
         # Interval strings belong to date_breaks; `breaks` expects explicit values.
         + scale_x_datetime(limits=pd.to_datetime(time_interval),
                            date_breaks='1 days', date_labels='%y-%m-%d %H:%M')
         + theme_bw()
         + theme(axis_text_x=element_text(angle=45, hjust=0.5, face='bold', color='black'),
                 axis_text_y=element_text(face='bold', colour='black'),
                 legend_title=element_text(face='bold', colour='black'),
                 legend_position='right',
                 legend_direction="vertical"))
    p.save(filename=category + '_chart' + '.png',
           path=os.path.join(os.path.abspath(os.path.dirname(__file__)), 'png'),
           width=8, height=6, units='in', dpi=326, verbose=False)
    return p
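# Usage sketch (hypothetical data): any column whose name contains
# "TEMP_Value" is picked up for the temperature chart, and the png/ directory
# next to this module must exist before saving:
#
# import numpy as np
# sensors = pd.DataFrame({
#     "Timestamp": pd.date_range("2021-01-01", periods=24, freq="h"),
#     "A_TEMP_Value": np.random.normal(60, 2, 24),
#     "B_TEMP_Value": np.random.normal(55, 2, 24),
# })
# p = plot_chart(sensors, category="temperature")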
import os

import pandas as pd
from plotnine import (aes, element_text, geom_line, geom_point, geom_ribbon,
                      geom_vline, ggplot, scale_x_datetime, theme, theme_bw,
                      xlab, ylab)


def plot_arima(df):
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    actual_col = df.columns.values[1]    # observed series
    predict_col = df.columns.values[2]   # forecast series
    last_observed = max(df[['Timestamp', actual_col]].dropna(axis=0)['Timestamp'])
    p = (ggplot(data=df, mapping=aes(x='Timestamp', y=actual_col))
         + geom_point(colour='blue', alpha=0.3, na_rm=True)
         + geom_line(colour='blue', na_rm=True)
         + geom_point(mapping=aes(x='Timestamp', y=predict_col),
                      colour='red', alpha=0.3, na_rm=True)
         + geom_line(mapping=aes(x='Timestamp', y=predict_col),
                     colour='red', na_rm=True)
         # Mark where the observations end and the forecast begins.
         + geom_vline(xintercept=last_observed, color='green', linetype='dashed')
         + geom_ribbon(data=df, mapping=aes(ymin='Lower', ymax='Upper'),
                       fill='red', alpha=0.1)
         + scale_x_datetime(date_breaks='1 days', date_labels='%y-%m-%d %H:%M')
         + xlab('Time') + ylab(actual_col)
         + theme_bw()
         + theme(axis_text_x=element_text(angle=45, hjust=1, face='bold', color='black'),
                 axis_text_y=element_text(face='bold', colour='black')))
    p.save(filename=actual_col + '_predict.png',
           path=os.path.join(os.path.abspath(os.path.dirname(__file__)), 'png'),
           width=8, height=6, units='in', dpi=326, verbose=False)
    return p
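# Usage sketch (hypothetical columns): plot_arima reads the observed series
# from column 1 and the forecast from column 2, with Lower/Upper bounding the
# prediction interval; the dashed green line marks the last observed timestamp:
#
# ts = pd.date_range("2021-01-01", periods=30, freq="h")
# df = pd.DataFrame({"Timestamp": ts,
#                    "PSIG_Value": [100.0] * 20 + [None] * 10,
#                    "Forecast": 100.0, "Lower": 99.0, "Upper": 101.0})
# p = plot_arima(df)  # writes png/PSIG_Value_predict.png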
def general(Data):
    logging.info('======= Creating general =======')
    print('======= Creating general =======')
    missing = Data.Intensity[Data.Intensity.isna()]
    if len(missing) == len(Data):
        print("WARNING: All values for Intensity are NA's")
    else:
        # Minutes since midnight, so time of day can be plotted against the date.
        Data['Minutesss'] = pd.to_datetime(Data['date'], errors='coerce')
        Data.date = pd.to_datetime(Data.date, errors='coerce')
        Data['Minutesss'] = Data['Minutesss'].dt.hour * 60 + Data['Minutesss'].dt.minute
        plot = (p9.ggplot(data=Data,
                          mapping=p9.aes(x='date', y='Minutesss', colour='Intensity'))
                + p9.geom_point(size=2)
                + p9.theme_classic()
                + p9.scale_colour_gradient(low="white", high="red", aesthetics="colour")
                + p9.theme(axis_text=p9.element_text(size=18),
                           axis_title=p9.element_text(size=18, face='bold'),
                           legend_position='none')
                + p9.scale_x_datetime(date_labels='%b %y', date_breaks='6 months')
                + p9.labs(x='', y='', colour='Intensity: '))
        # Create and save TL_1
        if len(Data) > 0:
            plot.save(filename='TL_1.jpeg', path="pdf/iteration/",
                      width=25, height=5, dpi=320)
        else:
            print('Plot not created; no data found.')
    print('================================= general DONE =============================')
def show_community_prediction(
    self,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    num_samples: int = 1000,
    bins: int = 50,
):
    """
    Plot samples from the community prediction on this question

    :param percent_kept: percentage of the sample distribution to keep
    :param side_cut_from: which side to cut tails from, either 'both', 'lower', or 'upper'
    :param num_samples: number of samples from the community
    :param bins: number of bins in the histogram; more bins gives a more
        fine-grained graph, fewer bins more aggregation
    :return: ggplot graphics object
    """
    community_samples = pd.Series(
        [self.sample_normalized_community() for _ in range(num_samples)])
    (_xmin, _xmax) = self.get_central_quantiles(community_samples,
                                                percent_kept=percent_kept,
                                                side_cut_from=side_cut_from)
    _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
    df = pd.DataFrame(data={"samples": self.denormalize_samples(community_samples)})
    title_name = (
        f"Q: {self.name}" if self.name
        else "\n".join(textwrap.wrap(self.data["title"], 60))  # type: ignore
    )
    return (ggplot(df, aes("samples"))
            + geom_histogram(fill="#b3cde3", bins=bins)
            + scale_x_datetime(limits=(_xmin, _xmax))
            + labs(x="Prediction", y="Counts", title=title_name)
            + ergo_theme
            + theme(axis_text_x=element_text(rotation=45, hjust=1)))
def plot_portfolio_vs_benchmark(cumulative_returns, benchmark_cum_returns):
    benchmark_cum_returns = benchmark_cum_returns.rename(columns={"benchmark": "returns"})
    benchmark_cum_returns['key'] = "benchmark"
    cumulative_returns['key'] = "portfolio"
    df = pd.concat([cumulative_returns, benchmark_cum_returns])
    df.index.name = 'date'
    df.reset_index(level=0, inplace=True)
    df['returns'] = df['returns'] * 100
    warnings.filterwarnings('ignore')
    # data_path, results_path and portfolio_name are module-level settings.
    df.to_csv(data_path + portfolio_name + 'returns.csv', header=True)
    r = (ggplot(df)
         + aes(x='date', y='returns', color='key', group='key')
         + geom_line()
         + scale_x_datetime(breaks=date_breaks('1 years'), labels=date_format('%Y'))
         + theme(axis_text_x=element_text(rotation=90, hjust=1))
         + labs(title=portfolio_name + 'portfolio vs. benchmark', y='Returns %'))
    r.save(filename=portfolio_name + 'returns.png', format="png",
           path=results_path, width=6.4, height=4.8, dpi=125)
    warnings.filterwarnings('default')
def plot_drawdowns(cumulative_returns, benchmark_cum_returns):
    """Any time the cumulative returns dip below the current cumulative maximum
    returns, it's a drawdown. Drawdowns are measured as a percentage of that
    maximum cumulative return, in effect, measured from peak equity."""
    benchmark_drawdown = get_drawdown(benchmark_cum_returns)
    benchmark_drawdown = benchmark_drawdown.to_frame()
    benchmark_drawdown = benchmark_drawdown.rename(columns={"benchmark": "drawdown"})
    benchmark_drawdown['key'] = "benchmark"
    benchmark_drawdown.index.name = 'date'
    benchmark_drawdown.reset_index(level=0, inplace=True)
    portfolio_drawdown = get_drawdown(cumulative_returns)
    portfolio_drawdown = portfolio_drawdown.to_frame()
    portfolio_drawdown['key'] = "portfolio"
    portfolio_drawdown = portfolio_drawdown.rename(columns={"returns": "drawdown"})
    portfolio_drawdown.index.name = 'date'
    portfolio_drawdown.reset_index(level=0, inplace=True)
    # Keep only benchmark dates that also appear in the portfolio series.
    mask = benchmark_drawdown.date.isin(portfolio_drawdown.date)
    benchmark_drawdown = benchmark_drawdown[mask]
    df = pd.concat([portfolio_drawdown, benchmark_drawdown])
    df.to_csv(data_path + portfolio_name + 'drawdowns.csv', header=True)
    warnings.filterwarnings('ignore')
    d = (ggplot(df)
         + aes(x='date', y='drawdown', color='key', group='key')
         + geom_line()
         + scale_x_datetime(breaks=date_breaks('1 years'), labels=date_format('%Y'))
         + theme(axis_text_x=element_text(rotation=90, hjust=1))
         + labs(title=portfolio_name + 'portfolio vs. benchmark',
                y='Drawdown % (change peak to trough)'))
    d.save(filename=portfolio_name + 'drawdowns.png', format="png",
           path=results_path, width=6.4, height=4.8, dpi=125)
    warnings.filterwarnings('default')
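# `get_drawdown` is defined elsewhere in this module. A minimal sketch of the
# peak-to-trough computation the docstring above describes might look like
# this (hypothetical helper; assumes `cum_returns` is a pandas Series of
# fractional cumulative returns and that the result is wanted in percent):
def get_drawdown_sketch(cum_returns):
    equity = 1.0 + cum_returns      # equity curve implied by cumulative returns
    running_peak = equity.cummax()  # highest equity reached so far
    # Percentage decline from the running peak; zero whenever a new high is set.
    return (equity - running_peak) / running_peak * 100.0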
import warnings

import plotnine as pn
from mizani.formatters import date_format


def make_plots(leak_df, time_df, site_df, sim_n, spin_up, output_directory):
    """
    This function makes a set of standard plots to output at end of simulation.
    """
    # Temporarily mute warnings
    warnings.filterwarnings('ignore')
    pn.theme_set(pn.theme_linedraw())

    # Shared grid styling for all plots below.
    grid_theme = pn.theme(
        panel_border=pn.element_rect(colour="black", fill=None, size=2),
        panel_grid_minor_x=pn.element_blank(),
        panel_grid_major_x=pn.element_blank(),
        panel_grid_minor_y=pn.element_line(colour='black', linewidth=0.5, alpha=0.3),
        panel_grid_major_y=pn.element_line(colour='black', linewidth=1, alpha=0.5))

    # Chop off spin-up year (only for plots, still exists in raw output)
    time_df_adj = time_df.iloc[spin_up:, ]

    # Timeseries plots
    plot_time_1 = (pn.ggplot(time_df_adj, pn.aes('datetime', 'daily_emissions_kg'))
                   + pn.geom_line(size=2)
                   + pn.ggtitle('Daily emissions from all sites (kg)')
                   + pn.ylab('') + pn.xlab('')
                   + pn.scale_x_datetime(labels=date_format('%Y'))
                   + grid_theme)
    plot_time_1.save(output_directory + '/plot_time_emissions_' + sim_n + '.png',
                     width=10, height=3, dpi=300)

    plot_time_2 = (pn.ggplot(time_df_adj, pn.aes('datetime', 'active_leaks'))
                   + pn.geom_line(size=2)
                   + pn.ggtitle('Number of active leaks at all sites')
                   + pn.ylab('') + pn.xlab('')
                   + pn.scale_x_datetime(labels=date_format('%Y'))
                   + grid_theme)
    plot_time_2.save(output_directory + '/plot_time_active_' + sim_n + '.png',
                     width=10, height=3, dpi=300)

    # Site-level plots
    plot_site_1 = (pn.ggplot(site_df, pn.aes('cum_frac_sites', 'cum_frac_emissions'))
                   + pn.geom_line(size=2)
                   + grid_theme
                   + pn.xlab('Cumulative fraction of sites')
                   + pn.ylab('Cumulative fraction of emissions')
                   + pn.ggtitle('Empirical cumulative distribution of site-level emissions'))
    plot_site_1.save(output_directory + '/site_cum_dist_' + sim_n + '.png',
                     width=5, height=4, dpi=300)

    # Leak plots
    plot_leak_1 = (pn.ggplot(leak_df, pn.aes('days_active'))
                   + pn.geom_histogram(colour='gray')
                   + grid_theme
                   + pn.ggtitle('Distribution of leak duration')
                   + pn.xlab('Number of days the leak was active')
                   + pn.ylab('Count'))
    plot_leak_1.save(output_directory + '/leak_active_hist' + sim_n + '.png',
                     width=5, height=4, dpi=300)

    plot_leak_2 = (pn.ggplot(leak_df, pn.aes('cum_frac_leaks', 'cum_frac_rate', colour='status'))
                   + pn.geom_line(size=2)
                   + pn.scale_colour_hue(h=0.15, l=0.25, s=0.9)
                   + grid_theme
                   + pn.xlab('Cumulative fraction of leak sources')
                   + pn.ylab('Cumulative leak rate fraction')
                   + pn.ggtitle('Fractional cumulative distribution'))
    plot_leak_2.save(output_directory + '/leak_cum_dist1_' + sim_n + '.png',
                     width=4, height=4, dpi=300)

    plot_leak_3 = (pn.ggplot(leak_df, pn.aes('cum_frac_leaks', 'cum_rate', colour='status'))
                   + pn.geom_line(size=2)
                   + pn.scale_colour_hue(h=0.15, l=0.25, s=0.9)
                   + grid_theme
                   + pn.scale_y_continuous(trans='log10')
                   + pn.xlab('Cumulative fraction of leak sources')
                   + pn.ylab('Cumulative emissions (kg/day)')
                   + pn.ggtitle('Absolute cumulative distribution'))
    plot_leak_3.save(output_directory + '/leak_cum_dist2_' + sim_n + '.png',
                     width=4, height=4, dpi=300)
def batch_plots(self):
    """Make batch comparison plots across programs and write summary CSVs."""
    pn.theme_set(pn.theme_linedraw())

    # Shared grid styling for all timeseries plots below.
    grid_theme = pn.theme(
        panel_border=pn.element_rect(colour="black", fill=None, size=2),
        panel_grid_minor_x=pn.element_blank(),
        panel_grid_major_x=pn.element_blank(),
        panel_grid_minor_y=pn.element_line(colour='black', linewidth=0.5, alpha=0.3),
        panel_grid_major_y=pn.element_line(colour='black', linewidth=1, alpha=0.5))

    def summarize(dfs):
        """Add mean/std/quantile columns per program and move the reference
        program to the front of the list."""
        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))
        return dfs

    def melt_and_combine(dfs):
        """Reshape each program dataframe and stack into one long dataframe."""
        melted = [pd.melt(df, id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])
                  for df in dfs]
        return pd.concat(melted, ignore_index=True)

    # First, put together active leak data and output for live plotting
    # functionality (no active-leak plot here currently).
    df_al = melt_and_combine(summarize(self.active_leak_dfs))
    df_al.to_csv(self.output_directory + 'mean_active_leaks.csv', index=True)

    # Now repeat for emissions, which are actually used for batch plotting.
    dfs = summarize(self.emission_dfs)
    df_p1 = melt_and_combine(dfs)
    # Output emissions df for other uses (e.g. live plot)
    df_p1.to_csv(self.output_directory + 'mean_emissions.csv', index=True)

    plot1 = (pn.ggplot(None) + pn.aes('datetime', 'value', group='program')
             + pn.geom_ribbon(df_p1, pn.aes(ymin='low', ymax='high', fill='program'), alpha=0.2)
             + pn.geom_line(df_p1, pn.aes('datetime', 'mean', colour='program'), size=1)
             + pn.ylab('Daily emissions (kg/site)') + pn.xlab('')
             + pn.scale_colour_hue(h=0.15, l=0.25, s=0.9)
             + pn.scale_x_datetime(labels=date_format('%Y'))
             + pn.scale_y_continuous(trans='log10')
             + pn.ggtitle('To reduce uncertainty, use more simulations.')
             + pn.labs(color='Program', fill='Program')
             + grid_theme)
    plot1.save(self.output_directory + 'program_comparison.png', width=7, height=3, dpi=900)

    # Build relative mitigation plots: compare each program to the reference.
    dfs_p2 = dfs.copy()
    for i in dfs_p2[1:]:
        i['mean_dif'] = 0
        i['std_dif'] = 0
        i['mean_ratio'] = 0
        i['std_ratio'] = 0
        for j in range(len(i)):
            ref_mean = dfs_p2[0].loc[dfs_p2[0].index[j], 'mean']
            ref_std = dfs_p2[0].loc[dfs_p2[0].index[j], 'std']
            alt_mean = i.loc[i.index[j], 'mean']
            alt_std = i.loc[i.index[j], 'std']
            i.loc[i.index[j], 'mean_dif'] = alt_mean - ref_mean
            i.loc[i.index[j], 'std_dif'] = math.sqrt(alt_std ** 2 + ref_std ** 2)
            i.loc[i.index[j], 'mean_ratio'] = alt_mean / ref_mean
            i.loc[i.index[j], 'std_ratio'] = math.sqrt(
                (alt_std / alt_mean) ** 2 + (ref_std / ref_mean) ** 2)

    # Build plotting dataframe
    df_p2 = self.dates_trunc.copy().to_frame()
    df_p2['program'] = dfs_p2[1]['program']
    df_p2['mean_dif'] = dfs_p2[1]['mean_dif']
    df_p2['std_dif'] = dfs_p2[1]['std_dif']
    df_p2['mean_ratio'] = dfs_p2[1]['mean_ratio']
    df_p2['std_ratio'] = dfs_p2[1]['std_ratio']
    df_p2['low_dif'] = dfs_p2[1]['mean_dif'] - 2 * dfs_p2[1]['std_dif']
    df_p2['high_dif'] = dfs_p2[1]['mean_dif'] + 2 * dfs_p2[1]['std_dif']
    df_p2['low_ratio'] = dfs_p2[1]['mean_ratio'] / (
        dfs_p2[1]['mean_ratio'] + 2 * dfs_p2[1]['std_ratio'])
    df_p2['high_ratio'] = dfs_p2[1]['mean_ratio'] + 2 * dfs_p2[1]['std_ratio']
    pd.options.mode.chained_assignment = None
    for i in dfs_p2[2:]:
        i['low_dif'] = i['mean_dif'] - 2 * i['std_dif']
        i['high_dif'] = i['mean_dif'] + 2 * i['std_dif']
        i['low_ratio'] = i['mean_ratio'] / (i['mean_ratio'] + 2 * i['std_ratio'])
        i['high_ratio'] = i['mean_ratio'] + 2 * i['std_ratio']
        short_df = i[['program', 'mean_dif', 'std_dif', 'low_dif', 'high_dif',
                      'mean_ratio', 'std_ratio', 'low_ratio', 'high_ratio']]
        short_df['datetime'] = np.array(self.dates_trunc)
        df_p2 = pd.concat([df_p2, short_df], ignore_index=True)

    # Make plot 2
    plot2 = (pn.ggplot(None) + pn.aes('datetime', 'mean_dif', group='program')
             + pn.geom_ribbon(df_p2, pn.aes(ymin='low_dif', ymax='high_dif', fill='program'), alpha=0.2)
             + pn.geom_line(df_p2, pn.aes('datetime', 'mean_dif', colour='program'), size=1)
             + pn.ylab('Daily emissions difference (kg/site)') + pn.xlab('')
             + pn.scale_colour_hue(h=0.15, l=0.25, s=0.9)
             + pn.scale_x_datetime(labels=date_format('%Y'))
             + pn.ggtitle('Daily differences may be uncertain for small sample sizes')
             + pn.labs(color='Program', fill='Program')
             + grid_theme)
    plot2.save(self.output_directory + 'relative_mitigation.png', width=7, height=3, dpi=900)

    # Make plot 3
    plot3 = (pn.ggplot(None) + pn.aes('datetime', 'mean_ratio', group='program')
             + pn.geom_ribbon(df_p2, pn.aes(ymin='low_ratio', ymax='high_ratio', fill='program'), alpha=0.2)
             + pn.geom_hline(yintercept=1, size=0.5, colour='blue')
             + pn.geom_line(df_p2, pn.aes('datetime', 'mean_ratio', colour='program'), size=1)
             + pn.ylab('Emissions ratio') + pn.xlab('')
             + pn.scale_colour_hue(h=0.15, l=0.25, s=0.9)
             + pn.scale_x_datetime(labels=date_format('%Y'))
             + pn.ggtitle('Blue line represents equivalence. \nIf uncertainty is high, use more '
                          'simulations and/or sites. \nLook also at the ratio of mean daily '
                          'emissions over the entire timeseries.')
             + pn.labs(color='Program', fill='Program')
             + grid_theme)
    plot3.save(self.output_directory + 'relative_mitigation2.png', width=7, height=3, dpi=900)

    # ------ Figure to compare costs ------
    df_cost = melt_and_combine(summarize(self.cost_dfs))
    df_cost.to_csv(self.output_directory + 'rolling_cost_estimates.csv', index=True)

    cost_plot = (pn.ggplot(None) + pn.aes('datetime', 'value', group='program')
                 + pn.geom_ribbon(df_cost, pn.aes(ymin='low', ymax='high', fill='program'), alpha=0.2)
                 + pn.geom_line(df_cost, pn.aes('datetime', 'mean', colour='program'), size=1)
                 + pn.ylab('Estimated cost per facility') + pn.xlab('')
                 + pn.scale_colour_hue(h=0.15, l=0.25, s=0.9)
                 + pn.scale_x_datetime(labels=date_format('%Y'))
                 + pn.labs(color='Program', fill='Program')
                 + grid_theme)
    cost_plot.save(self.output_directory + 'cost_estimate_temporal.png', width=7, height=3, dpi=900)

    # Cost breakdown by program and method
    method_lists = []
    for i in range(len(self.directories)):
        df = pd.read_csv(self.output_directory + self.directories[i] + "/timeseries_output_0.csv")
        df = df.filter(regex='cost$', axis=1)
        df = df.drop(columns=["total_daily_cost"])
        method_lists.append(list(df))

    # Annualized per-site cost for each simulation and method.
    costs = [[] for _ in range(len(self.all_data))]
    for i in range(len(self.all_data)):
        for j in range(len(self.all_data[i])):
            simcosts = []
            for k in range(len(method_lists[i])):
                timesteps = len(self.all_data[i][j][method_lists[i][k]])
                simcosts.append(
                    (sum(self.all_data[i][j][method_lists[i][k]]) / timesteps / self.n_sites) * 365)
            costs[i].append(simcosts)

    rows_list = []
    for i in range(len(costs)):
        df_temp = pd.DataFrame(costs[i])
        for j in range(len(df_temp.columns)):
            rows_list.append({
                'Program': self.directories[i],
                'Mean Cost': round(df_temp.iloc[:, j].mean()),
                'St. Dev.': df_temp.iloc[:, j].std(),
                'Method': method_lists[i][j].replace('_cost', ''),
            })
    df = pd.DataFrame(rows_list)
    # Output cost df for other uses
    df.to_csv(self.output_directory + 'cost_comparison.csv', index=True)

    plot = (pn.ggplot(df, pn.aes(x='Program', y='Mean Cost', fill='Method', label='Mean Cost'))
            + pn.geom_bar(stat="identity")
            + pn.ylab('Cost per Site per Year') + pn.xlab('Program')
            + pn.scale_fill_hue(h=0.15, l=0.25, s=0.9)
            + pn.geom_text(size=15, position=pn.position_stack(vjust=0.5))
            + grid_theme)
    plot.save(self.output_directory + 'cost_comparison.png', width=7, height=3, dpi=900)
def marginal_plot(df,
                  x,
                  y,
                  group=None,
                  facet_x=None,
                  facet_y=None,
                  aggfun='sum',
                  bins=21,
                  use_quantiles=False,
                  label_pos='auto',
                  label_function=ez_labels,
                  sort_groups=True,
                  base_size=10,
                  figure_size=(6, 3)):
    '''
    Bin the data in a df and plot it using lines.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    x : str
        quoted expression to be plotted on the x axis
    y : str
        quoted expression to be plotted on the y axis
    group : str
        quoted expression to be used as group (ie color)
    facet_x : str
        quoted expression to be used as facet
    facet_y : str
        quoted expression to be used as facet
    aggfun : str or fun
        function to be used for aggregating (eg sum, mean, median ...)
    bins : int or tuple
        number of bins to be used
    use_quantiles : bool
        bin data using quantiles
    label_pos : str
        use a count label on each point; choose between None, 'auto' or 'force'
    label_function : callable
        labelling function
    sort_groups : bool
        sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
        base size for theme_ez
    figure_size : tuple of int
        figure size

    Returns
    -------
    g : EZPlot
        EZplot object
    '''

    if label_pos not in [None, 'auto', 'force']:
        log.error("label_pos not recognized")
        raise NotImplementedError("label_pos not recognized")
    elif label_pos == 'auto':
        show_labels = bins <= 21 and group is None
    else:
        show_labels = label_pos == 'force'

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}
    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {c: c for c in tmp_df.columns
                  if c in ['x', 'group', 'facet_x', 'facet_y']}
    new_variables = {'y': 'y'}

    # bin data
    if use_quantiles:
        quantile_groups = [c for c in tmp_df.columns
                           if c in ['group', 'facet_x', 'facet_y']]
        if len(quantile_groups) > 0:
            tmp_df['x'] = tmp_df.groupby(quantile_groups)['x'] \
                .apply(lambda x: qbin_data(x, bins))
        else:
            tmp_df['x'] = qbin_data(tmp_df['x'], bins)
    else:
        tmp_df['x'], _, _ = bin_data(tmp_df['x'], bins, None)

    # aggregate data and reorder columns
    gdata = agg_data(tmp_df, new_variables, new_groups, aggfun, fill_groups=False)
    gdata = gdata[[c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]

    # init plot obj
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if sort_groups:
        sort_data_groups(g)

    # get colors
    colors = np.flip(ez_colors(g.n_groups('group')))

    # set groups
    if group is None:
        g += p9.geom_line(p9.aes(x="x", y="y"), group=1, colour=colors[0])
        if show_labels:
            g += p9.geom_point(p9.aes(x="x", y="y"), group=1, colour=colors[0])
    else:
        g += p9.geom_line(p9.aes(x="x", y="y", group="factor(group)",
                                 colour="factor(group)"))
        if show_labels:
            g += p9.geom_point(p9.aes(x="x", y="y", colour="factor(group)"))
        g += p9.scale_color_manual(values=colors)

    # set labels
    if show_labels:
        groups_to_count = [c for c in tmp_df.columns
                           if c in ['x', 'group', 'facet_x', 'facet_y']]
        tmp_df['counts'] = 1
        top_labels = tmp_df \
            .groupby(groups_to_count)['counts'] \
            .sum() \
            .reset_index()
        top_labels['label'] = label_function(top_labels['counts'])

        # make sure labels and data can be joined
        for c in ['group', 'facet_x', 'facet_y']:
            if c in tmp_df.columns:
                try:
                    top_labels[c] = pd.Categorical(
                        top_labels[c].astype(str),
                        categories=g.data[c].cat.categories,
                        ordered=g.data[c].cat.ordered)
                except Exception:
                    pass

        g.data = pd.merge(g.data, top_labels, on=groups_to_count, how='left')
        g.data['label_pos'] = g.data['y'] + \
            np.sign(g.data['y']) * g.data['y'].abs().max() * 0.02

        g += p9.geom_text(p9.aes(x='x', y='label_pos', label='label'),
                          color="#000000",
                          size=base_size * 0.7,
                          ha='center',
                          va='bottom')

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'], size=base_size))

    return g
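# Usage sketch (hypothetical columns; assumes the ezplot-style helpers this
# module relies on -- EZPlot, agg_data, bin_data, qbin_data, ez_colors,
# ez_labels, theme_ez, sort_data_groups, log -- are importable):
#
# g = marginal_plot(df, x='price', y='volume', group='exchange',
#                   bins=15, use_quantiles=True)
# g  # render in a notebook, or g.save('marginal.png')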
def show_prediction(
    self,
    samples,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    show_community: bool = False,
    num_samples: int = 1000,
    bins: int = 50,
):
    """Plot a prediction on the true question scale from samples or a
    submission object. Optionally compare the prediction against a sample
    from the distribution of community predictions.

    :param samples: samples from a distribution answering the prediction
        question (true scale) or a prediction object
    :param percent_kept: percentage of the sample distribution to keep
    :param side_cut_from: which side to cut tails from, either 'both', 'lower', or 'upper'
    :param show_community: whether a comparison to community predictions should be made
    :param num_samples: number of samples from the community
    :param bins: number of bins in the histogram; more bins gives a more
        fine-grained graph, fewer bins more aggregation
    :return: ggplot graphics object
    """
    if isinstance(samples, SubmissionMixtureParams):
        prediction = samples
        prediction_normed_samples = pd.Series(
            [logistic.sample_mixture(prediction) for _ in range(num_samples)])
    else:
        if isinstance(samples, list):
            samples = pd.Series(samples)
        if not isinstance(samples, (pd.Series, np.ndarray)):
            raise ValueError("Samples should be a list, numpy array or pandas series")
        num_samples = samples.shape[0]
        prediction_normed_samples = self.normalize_samples(samples)
    title_name = (
        f"Q: {self.name}" if self.name
        else "\n".join(textwrap.wrap(self.data["title"], 60))  # type: ignore
    )
    if show_community:
        df = pd.DataFrame(data={
            "community": [  # type: ignore
                self.sample_normalized_community() for _ in range(num_samples)
            ],
            "prediction": prediction_normed_samples,  # type: ignore
        })
        # get the graph domain given the percentage of the distribution kept
        (_xmin, _xmax) = self.get_central_quantiles(df,
                                                    percent_kept=percent_kept,
                                                    side_cut_from=side_cut_from)
        _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
        df["prediction"] = self.denormalize_samples(df["prediction"])
        df["community"] = self.denormalize_samples(df["community"])
        df = pd.melt(df, var_name="sources", value_name="samples")  # type: ignore
        return (ggplot(df, aes("samples", fill="sources"))
                + scale_fill_brewer(type="qual", palette="Pastel1")
                + geom_histogram(position="identity", alpha=0.9)
                + scale_x_datetime(limits=(_xmin, _xmax))
                + facet_wrap("sources", ncol=1)
                + labs(x="Prediction", y="Counts", title=title_name)
                + guides(fill=False)
                + ergo_theme
                + theme(axis_text_x=element_text(rotation=45, hjust=1)))
    else:
        (_xmin, _xmax) = self.get_central_quantiles(
            prediction_normed_samples,
            percent_kept=percent_kept,
            side_cut_from=side_cut_from,
        )
        _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
        df = pd.DataFrame(data={
            "prediction": self.denormalize_samples(prediction_normed_samples)
        })
        return (ggplot(df, aes("prediction"))
                + geom_histogram(fill="#b3cde3", bins=bins)
                + scale_x_datetime(limits=(_xmin, _xmax))
                + labs(x="Prediction", y="Counts", title=title_name)
                + ergo_theme
                + theme(axis_text_x=element_text(rotation=45, hjust=1)))
def area_plot(df,
              x,
              y,
              group=None,
              facet_x=None,
              facet_y=None,
              aggfun='sum',
              fill=False,
              sort_groups=True,
              base_size=10,
              figure_size=(6, 3)):
    '''
    Aggregates data in df and plots as a stacked area chart.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    x : str
        quoted expression to be plotted on the x axis
    y : str
        quoted expression to be plotted on the y axis
    group : str
        quoted expression to be used as group (ie color)
    facet_x : str
        quoted expression to be used as facet
    facet_y : str
        quoted expression to be used as facet
    aggfun : str or fun
        function to be used for aggregating (eg sum, mean, median ...)
    fill : bool
        plot shares for each group instead of absolute values
    sort_groups : bool
        sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
        base size for theme_ez
    figure_size : tuple of int
        figure size

    Returns
    -------
    g : EZPlot
        EZplot object
    '''

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}
    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, aggfun, fill_groups=True)
    gdata['y'].fillna(0, inplace=True)
    gdata = gdata[[c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]

    if fill:
        groups_to_normalize = [c for c in ['x', 'facet_x', 'facet_y']
                               if c in gdata.columns]
        total_values = gdata \
            .groupby(groups_to_normalize)['y'] \
            .sum() \
            .reset_index() \
            .rename(columns={'y': 'tot_y'})
        gdata = pd.merge(gdata, total_values, on=groups_to_normalize)
        gdata['y'] = gdata['y'] / (gdata['tot_y'] + EPSILON)
        gdata.drop('tot_y', axis=1, inplace=True)
        ylabeller = percent_labels
    else:
        ylabeller = ez_labels

    # get plot object
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if sort_groups:
        sort_data_groups(g)

    # get colors
    colors = np.flip(ez_colors(g.n_groups('group')))

    # set groups
    if group is None:
        g += p9.geom_area(p9.aes(x="x", y="y"),
                          colour=None, fill=ez_colors(1)[0], na_rm=True)
    else:
        g += p9.geom_area(p9.aes(x="x", y="y", group="factor(group)",
                                 fill="factor(group)"),
                          colour=None, na_rm=True)
        g += p9.scale_fill_manual(values=colors)

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ylabeller,
                               expand=[0, 0, 0.1 * (not fill) + 0.03, 0])

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'], size=base_size))

    if sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True),
                       color=p9.guide_legend(reverse=True))

    return g
group="group", colour="group"), na_rm=True) g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group'))) g += p9.scale_color_manual(values=ez_colors(g.n_groups('group'))) # set facets if facet_x is not None and facet_y is None: g += p9.facet_wrap('~facet_x') if facet_x is not None and facet_y is not None: g += p9.facet_grid('facet_y~facet_x') # set x scale if g.column_is_timestamp('x'): g += p9.scale_x_datetime() elif g.column_is_categorical('x'): g += p9.scale_x_discrete() else: g += p9.scale_x_continuous(labels=ez_labels) # set y scale g += p9.scale_y_continuous(labels=ez_labels) # set axis labels g += \ p9.xlab(names['x']) + \ p9.ylab(names['y']) # set theme g += theme_ez(figure_size=figure_size,
def scatter_plot(df,
                 x,
                 y,
                 group=None,
                 facet_x=None,
                 facet_y=None,
                 base_size=10,
                 figure_size=(6, 3),
                 **kwargs):
    '''
    Aggregates data in df and plots as a scatter plot chart.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    x : str
        quoted expression to be plotted on the x axis
    y : str
        quoted expression to be plotted on the y axis
    group : str
        quoted expression to be used as group (ie color)
    facet_x : str
        quoted expression to be used as facet
    facet_y : str
        quoted expression to be used as facet
    base_size : int
        base size for theme_ez
    figure_size : tuple of int
        figure size
    **kwargs
        additional kwargs passed to geom_point

    Returns
    -------
    g : EZPlot
        EZplot object
    '''

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}
    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=True)
    gdata = gdata[[c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]

    # add group_x column
    if group is not None:
        gdata['group_x'] = gdata['group'].astype('str') + '_' + gdata['x'].astype(str)

    g = EZPlot(gdata)

    # set groups
    if group is None:
        g += p9.geom_point(p9.aes(x="x", y="y"), colour=ez_colors(1)[0], **kwargs)
    else:
        g += p9.geom_point(p9.aes(x="x", y="y", group="factor(group)",
                                  color="factor(group)"), **kwargs)
        g += p9.scale_color_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    if g.column_is_timestamp('y'):
        g += p9.scale_y_datetime()
    elif g.column_is_categorical('y'):
        g += p9.scale_y_discrete()
    else:
        g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'], size=base_size))

    return g
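# Usage sketch (hypothetical columns; same ezplot helper assumptions as
# marginal_plot above). Extra kwargs such as `alpha` are forwarded to
# geom_point:
#
# g = scatter_plot(df, x='height', y='weight', group='species', alpha=0.5)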
def line_plot(df,
              x,
              y,
              group=None,
              facet_x=None,
              facet_y=None,
              aggfun='sum',
              err=None,
              show_points=False,
              base_size=10,
              figure_size=(6, 3)):
    '''
    Aggregates data in df and plots multiple columns as a line chart.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    x : str
        quoted expression to be plotted on the x axis
    y : str or list of str
        quoted expression(s) to be plotted on the y axis
    group : str
        quoted expression to be used as group (ie color)
    facet_x : str
        quoted expression to be used as facet
    facet_y : str
        quoted expression to be used as facet
    aggfun : str or fun
        function to be used for aggregating (eg sum, mean, median ...)
    err : str
        quoted expression to be used as error shaded area
    show_points : bool
        show/hide markers
    base_size : int
        base size for theme_ez
    figure_size : tuple of int
        figure size

    Returns
    -------
    g : EZPlot
        EZplot object
    '''

    if group is not None and isinstance(y, list) and len(y) > 1:
        log.error("groups can be specified only when a single y column is present")
        raise ValueError("groups can be specified only when a single y column is present")

    if err is not None and isinstance(y, list) and len(y) > 1:
        log.error("err can be specified only when a single y column is present")
        raise ValueError("err can be specified only when a single y column is present")

    if isinstance(y, list) and len(y) == 1:
        y = y[0]

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}
    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    if isinstance(y, list):
        ys = []
        for i, var in enumerate(y):
            ys.append('y_{}'.format(i))
            names['y_{}'.format(i)], variables['y_{}'.format(i)] = unname(var)

        # aggregate data
        tmp_gdata = agg_data(dataframe, variables, groups, aggfun, fill_groups=True)
        groups_present = [c for c in ['x', 'facet_x', 'facet_y']
                          if c in tmp_gdata.columns]
        gdata = pd.melt(tmp_gdata, groups_present, var_name='group', value_name='y')
        gdata['group'] = gdata['group'].replace({var: names[var] for var in ys})

        # update values for plotting
        names['y'] = 'Value'
        names['group'] = 'Variable'
        group = 'Variable'
    else:
        names['y'], variables['y'] = unname(y)
        if err is not None:
            names['err'], variables['err'] = unname(err)

        # aggregate data
        gdata = agg_data(dataframe, variables, groups, aggfun, fill_groups=True)

    # reorder columns
    gdata = gdata[[c for c in ['x', 'y', 'err', 'group', 'facet_x', 'facet_y']
                   if c in gdata.columns]]
    if err is not None:
        gdata['ymax'] = gdata['y'] + gdata['err']
        gdata['ymin'] = gdata['y'] - gdata['err']

    # init plot obj
    g = EZPlot(gdata)

    # set groups
    if group is None:
        g += p9.geom_line(p9.aes(x="x", y="y"), group=1, colour=ez_colors(1)[0])
        if show_points:
            g += p9.geom_point(p9.aes(x="x", y="y"), group=1, colour=ez_colors(1)[0])
        if err is not None:
            g += p9.geom_ribbon(p9.aes(x="x", ymax="ymax", ymin="ymin"),
                                group=1, fill=ez_colors(1)[0], alpha=0.2)
    else:
        g += p9.geom_line(p9.aes(x="x", y="y", group="factor(group)",
                                 colour="factor(group)"))
        if show_points:
            g += p9.geom_point(p9.aes(x="x", y="y", colour="factor(group)"))
        if err is not None:
            g += p9.geom_ribbon(p9.aes(x="x", ymax="ymax", ymin="ymin",
                                       fill="factor(group)"), alpha=0.2)
        g += p9.scale_color_manual(values=ez_colors(g.n_groups('group')))
        g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'], size=base_size))

    return g
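# Usage sketch (hypothetical columns; same ezplot helper assumptions as
# marginal_plot above). Passing a list of y columns melts them into a single
# "Variable" group with one line per column:
#
# g = line_plot(df, x='.index', y=['revenue', 'cost'], aggfun='mean',
#               show_points=True)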
def _scale_x(self, xmin=None, xmax=None):
    # Limits must be datetime-like values for a datetime scale, not floats.
    return scale_x_datetime(limits=(xmin, xmax))