def worldbank_plot(
    df: pd.DataFrame,
    title: str,
    dates_are_yearly: bool,
    figure_size=(12, 6),
    add_points=False,
    **plot_kwargs,
) -> p9.ggplot:
    """
    Carefully written to support all worldbank plots, this method is the one
    place where the app needs themes, colour maps and various plot related
    settings. For sparse datasets it uses geom_point() in addition to
    geom_line() in case the data is so sparse that lines cannot be drawn.

    Returns a ggplot instance or raises Http404 if the dataframe is missing
    or empty.
    """
    # Guard against both a missing and an empty dataframe: the original only
    # checked for None, so an empty frame hit ZeroDivisionError in the
    # pct_na computation below instead of the documented exception.
    if df is None or len(df) == 0:
        print(f"No usable data/plot for {title}")
        raise Http404(f"No data for {title}")

    # Percentage of missing metric values; drives the sparse-data handling.
    pct_na = (df["metric"].isnull().sum() / len(df)) * 100.0
    assert 0.0 <= pct_na <= 100.0

    plot = (
        p9.ggplot(df, p9.aes("date", "metric", **plot_kwargs))
        + p9.geom_path(size=1.2)
        + p9.scale_y_continuous(labels=label_shorten)
    )
    if dates_are_yearly:
        # yearly data? if so only print the year on the x-axis
        plot += p9.scale_x_datetime(labels=date_format("%Y"))

    # if pct_na is too high, geom_path() may be unable to draw a line (each
    # value is surrounded by nan preventing a path) so we use geom_point()
    # to highlight the sparse nature of the data
    if pct_na >= 30.0 or add_points or df["metric"].count() <= 3:
        plot += p9.geom_point(size=3.0)

    return user_theme(plot, y_axis_label="Value", figure_size=figure_size)
def test_date_format():
    """date_format() renders datetime sequences with a strftime template."""
    # Yearly range: one label per element.
    dates = pd.date_range('1/1/2010', periods=4, freq='4AS')
    assert date_format('%Y')(dates) == ['2010', '2014', '2018', '2022']

    # Full-date template on plain datetime objects.
    dates = [datetime(year=2005 + i, month=i, day=i) for i in range(1, 5)]
    expected = ['2006:01:01', '2007:02:02', '2008:03:03', '2009:04:04']
    assert date_format('%Y:%m:%d')(dates) == expected

    # Different timezones
    pacific = pytz.timezone('US/Pacific')
    kampala = pytz.timezone('Africa/Kampala')
    dates = [datetime(2010, 1, 1, tzinfo=kampala),
             datetime(2010, 1, 1, tzinfo=pacific)]
    with pytest.warns(UserWarning):
        date_format()(dates)
def _chart(self, df, checklist, selected_words):
    """Build a tile chart of top terms over time across channel categories.

    Returns an error plot when *df* is None (no data for the selection).
    NOTE(review): *checklist* is unused in this body — confirm whether the
    signature is required by callers.
    """
    if df is None:
        return self.getErrorPlot(
            self.ERROR_MSG.format(word=selected_words))
    p = (
        #ggplot(df , aes(x=TR.COLS.DATE , y=TrendRank.Consts.VAL_NAME))
        ggplot(df)
        #+ geom_tile(aes(x=TR.COLS.DATE , y=TrendRank.Consts.VAL_NAME , fill=TrendRank.Consts.VAR_NAME))
        + geom_tile(
            aes(x=TR.COLS.DATE,
                y=TR.COLS.CHNL,
                fill=TrendRank.Consts.VAR_NAME))
        # Compact month/year labels on the time axis.
        + scale_x_datetime(labels=date_format('%m/%y'))
        #+ geom_point(aes(fill=TrendRank.Consts.VAR_NAME, alpha=TrendRank.Consts.VAL_NAME) , stroke=0)
        #+ geom_smooth(aes(group=TrendRank.Consts.VAR_NAME , color=TrendRank.Consts.VAR_NAME),se=False)
        #+ geom_line(aes(group=TrendRank.Consts.VAR_NAME , color=TrendRank.Consts.VAR_NAME))
        #+ scale_y_discrete(limits = list(reversed(np.arange(len(selected_words)))))
        + ggtitle("Top Term over Time Across Categories")
        + THEME.mt
        + theme(figure_size=(20, 5),
                panel_grid_major=element_blank(),
                panel_grid_minor=element_blank()))
    # p = ggplot(df , aes(x=TR.COLS.DATE , y=TrendRank.Consts.VAR_NAME))\
    #     + geom_tile(aes(fill=TrendRank.Consts.VAL_NAME))\
    #     + facet_grid(f"~{TR.COLS.CHNL}")\
    #     + THEME.mt \
    #     + theme(figure_size=(20,5) , panel_grid_major=element_blank() , panel_grid_minor=element_blank())
    return p
def one_day_graph(collect_date='20191015', gateway_id='ep18270334'):
    """Plot one day's minute-level power on/off state for a gateway.

    Reads AH_USE_LOG_BYMINUTE for *collect_date* / *gateway_id* and returns a
    ggplot line chart of the thresholded POWER flag (1 when POWER > 20 W)
    over the day, with 2-hour breaks on the x-axis.
    """
    db = 'aihems_api_db'
    # db = 'aihems_service_db'
    # SECURITY(review): credentials are hard-coded here — move them into
    # configuration / environment variables before this ships anywhere.
    conn = pymysql.connect(
        host='aihems-service-db.cnz3sewvscki.ap-northeast-2.rds.amazonaws.com',
        port=3306,
        user='******',
        passwd='#cslee1234',
        db=db,
        charset='utf8')
    try:
        # Parameterized query instead of f-string interpolation: prevents
        # SQL injection and quoting bugs for caller-supplied values.
        sql = """
            SELECT
                COLLECT_DATE
                , COLLECT_TIME
                , ONOFF
                , case when POWER > 20 then 1 else 0 end POWER
                -- , POWER
                , ENERGY_DIFF
            FROM AH_USE_LOG_BYMINUTE
            WHERE 1=1
                AND GATEWAY_ID = %s
                AND COLLECT_DATE = %s
        """
        df = pd.read_sql(sql, con=conn, params=[gateway_id, collect_date])
    finally:
        conn.close()  # the original leaked the connection
    df['date'] = df.COLLECT_DATE + ' ' + df.COLLECT_TIME
    df.date = pd.to_datetime(df.date)
    print(collect_date)
    return (ggplot(df, aes(x='date', y='POWER'))
            + geom_line()
            + scale_x_datetime(breaks=date_breaks('2 hours'),
                               labels=date_format('%H')))
def __init__(self, **kwargs):
    """Initialize a datetime scale.

    String values for the general 'breaks'/'minor_breaks' parameters are
    treated as date_breaks() width specifications. The date-specific
    parameters ('date_breaks', 'date_labels', 'date_minor_breaks') are
    popped from kwargs and, because they are processed second, take
    precedence over the general ones.
    """
    # Permit the use of the general parameters for
    # specifying the format strings
    with suppress(KeyError):
        breaks = kwargs['breaks']
        if isinstance(breaks, six.string_types):
            kwargs['breaks'] = date_breaks(breaks)

    with suppress(KeyError):
        minor_breaks = kwargs['minor_breaks']
        if isinstance(minor_breaks, six.string_types):
            kwargs['minor_breaks'] = date_breaks(minor_breaks)

    # Using the more specific parameters take precedence
    with suppress(KeyError):
        breaks_fmt = kwargs.pop('date_breaks')
        kwargs['breaks'] = date_breaks(breaks_fmt)

    with suppress(KeyError):
        labels_fmt = kwargs.pop('date_labels')
        kwargs['labels'] = date_format(labels_fmt)

    with suppress(KeyError):
        minor_breaks_fmt = kwargs.pop('date_minor_breaks')
        kwargs['minor_breaks'] = date_breaks(minor_breaks_fmt)

    scale_continuous.__init__(self, **kwargs)
def plot_drawdowns(cumulative_returns, benchmark_cum_returns):
    """Any time the cumulative returns dips below the current cumulative
    maximum returns, it's a drawdown. Drawdowns are measured as a percentage
    of that maximum cumulative return, in effect, measured from peak equity.

    Writes the combined portfolio/benchmark drawdown series to
    '<data_path><portfolio_name>drawdowns.csv' and saves a line chart PNG to
    the results directory.
    """
    benchmark_drawdown = get_drawdown(benchmark_cum_returns).to_frame()
    benchmark_drawdown = benchmark_drawdown.rename(
        columns={"benchmark": "drawdown"})
    benchmark_drawdown['key'] = "benchmark"
    benchmark_drawdown.index.name = 'date'
    benchmark_drawdown.reset_index(level=0, inplace=True)

    portfolio_drawdown = get_drawdown(cumulative_returns).to_frame()
    portfolio_drawdown['key'] = "portfolio"
    portfolio_drawdown = portfolio_drawdown.rename(
        columns={"returns": "drawdown"})
    portfolio_drawdown.index.name = 'date'
    portfolio_drawdown.reset_index(level=0, inplace=True)

    # Restrict the benchmark to dates the portfolio also has.
    mask = benchmark_drawdown.date.isin(portfolio_drawdown.date)
    benchmark_drawdown = benchmark_drawdown[mask]

    # DataFrame.append() was removed in pandas 2.0; pd.concat is the
    # documented equivalent.
    df = pd.concat([portfolio_drawdown, benchmark_drawdown])
    df.to_csv(data_path + portfolio_name + 'drawdowns.csv', header=True)

    warnings.filterwarnings('ignore')
    d = (ggplot(df)
         + aes(x='date', y='drawdown', color='key', group='key')
         + geom_line()
         + scale_x_datetime(breaks=date_breaks('1 years'),
                            labels=date_format('%Y'))
         + theme(axis_text_x=element_text(rotation=90, hjust=1))
         + labs(title=portfolio_name + 'portfolio vs. benchmark',
                y='Drawdown % (change peak to trough)'))
    d.save(filename=portfolio_name + 'drawdowns.png',
           format="png", path=results_path, width=6.4, height=4.8, dpi=125)
    warnings.filterwarnings('default')
def test_date_format():
    """date_format() formats datetime sequences with a strftime template."""
    # Yearly date range: one label per element.
    x = pd.date_range('1/1/2010', periods=4, freq='4AS')
    result = date_format('%Y')(x)
    assert result == ['2010', '2014', '2018', '2022']

    # Full-date template on plain datetime objects.
    x = [datetime(year=2005+i, month=i, day=i) for i in range(1, 5)]
    result = date_format('%Y:%m:%d')(x)
    assert result == \
        ['2006:01:01', '2007:02:02', '2008:03:03', '2009:04:04']

    # Different timezones
    pct = pytz.timezone('US/Pacific')
    ug = pytz.timezone('Africa/Kampala')
    x = [datetime(2010, 1, 1, tzinfo=ug),
         datetime(2010, 1, 1, tzinfo=pct)]
    # Mixed timezones in one sequence should trigger a UserWarning.
    with pytest.warns(UserWarning):
        date_format()(x)
def create_plot(self, columns):
    """Return a faceted plotnine line chart of the monitored columns.

    The series is split wherever the gap between consecutive samples exceeds
    10x the median sampling period, so lines are not drawn across monitoring
    outages. X-axis labels switch to date format for spans longer than two
    days, time-of-day format otherwise.

    Raises ValueError for unknown column names and
    RatatouilleDependencyError when plotnine is not installed.
    """
    for col in columns:
        if col not in self.data.columns:
            raise ValueError('No column "%s" in the data' % col)
    try:
        from plotnine import ggplot, theme_bw, aes, geom_line, expand_limits, scale_x_datetime, ylab, facet_wrap, theme
        from mizani.formatters import date_format
    except ImportError:
        # Typo fix: "functionnality" -> "functionality".
        msg = """Package 'plotnine' is required for the plot functionality.
        Try installing it with 'pip install plotnine'.
        """
        raise RatatouilleDependencyError(msg)
    data = self.data.copy()
    if len(columns) > 0:
        data = data[['timestamp'] + columns]
    elif 'hostname' in data:
        data.drop('hostname', axis=1, inplace=True)
    # Gap between consecutive timestamps; gaps larger than 10x the median
    # step mark monitoring interruptions used as line breakpoints.
    data['time_diff'] = data['timestamp'][1:].reset_index(
        drop=True) - data['timestamp'][:-1].reset_index(drop=True)
    time_step = data['time_diff'].median()
    breakpoints = list(data[data['time_diff'] > time_step * 10].timestamp)
    breakpoints = [
        data['timestamp'].min(), *breakpoints, data['timestamp'].max()
    ]
    # Positional axis argument to drop() was removed in pandas 2.0; pass
    # axis=1 explicitly.
    data = data.drop('time_diff', axis=1).melt('timestamp')
    import pandas
    if len(columns) > 0:
        # Preserve the caller's column order in the facet ordering.
        data['variable'] = pandas.Categorical(data['variable'],
                                              categories=columns)
    plot = ggplot() + theme_bw()
    for min_t, max_t in zip(breakpoints[:-1], breakpoints[1:]):
        tmp = data[(data['timestamp'] > min_t) & (data['timestamp'] < max_t)]
        plot += geom_line(tmp,
                          aes(x='timestamp', y='value', color='variable'),
                          show_legend=False)
    plot += facet_wrap(['variable'], scales='free')
    timedelta = self.data.timestamp.max() - self.data.timestamp.min()
    if timedelta.days > 2:
        plot += scale_x_datetime(labels=date_format('%Y/%m/%d'))
    else:
        plot += scale_x_datetime(labels=date_format('%H:%M'))
    plot += ylab('Value')
    return plot
def create_plot(self, data, meta):
    """Create plot w/ custom stylings.

    Builds a styled proton-flux timeseries chart for the given signal; for
    signal 'P10' a log y-scale with WARNING/ALERT/CRITICAL threshold lines
    is added.
    """
    print(meta)
    title_by_signal = self.titles_by_signal.get(meta['signal'],
                                                self.unknown_title)
    title_by_signal += "\n Data Source: %s" % meta['source']
    ###
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # float is the documented replacement and is behaviorally identical here.
    x = np.array(data['values'])
    x = x.astype(float)
    t = np.array(data['time'])
    d = {'values': x, 'time': t}
    df = pd.DataFrame(data=d)
    df['time'] = pd.to_datetime(df['time'])
    ###
    t0 = df['time'][0]
    plot_theme = theme(
        # NOTE(review): height is passed before width here — confirm the
        # intended (width, height) ordering of figure_size.
        figure_size=(self.height, self.width),
        panel_background=element_rect(fill="black"),
        plot_background=element_rect(fill="gray"),
        panel_grid_major_y=element_blank(),
        panel_grid_major_x=element_blank(),
        panel_grid_minor_y=element_blank(),
        panel_grid_minor_x=element_blank(),
        title=element_text(color="black", size=20),
        axis_text_x=element_text(color="black", size=10),
        axis_text_y=element_text(color="black", size=15),
        axis_title_x=element_text(color="black", size=12),
        axis_title_y=element_text(color="black", size=15),
    )
    g = (ggplot()
         + geom_line(df, aes('time', 'values'), color="#76D115", size=2)
         + scale_x_datetime(labels=date_format("%b %d %H:%M"))
         + ggtitle(title_by_signal)
         + xlab("Universal Time")
         + ylab("Proton Flux Unit : Particles $cm^{-2}s^{-1}sr^{-1}$")
         + plot_theme)
    if meta['signal'] == 'P10':
        # Log-scale axis with labelled WARNING/ALERT/CRITICAL thresholds at
        # 10^0, 10^1 and 10^2 flux units respectively.
        g = (g
             + ylim(10**-1, 10**4)
             + scale_y_log10(
                 breaks=[10**-1, 10**0, 10**1, 10**2, 10**3, 10**4])
             + geom_hline(yintercept=10**0, color="#E6C329", size=3)
             + annotate(geom="text", label="WARNING", x=t0, y=1.15 * 10**0,
                        ha="left", size=12, color="#E6C329")
             + geom_hline(yintercept=10**1, color="#DE7F12", size=3)
             + annotate(geom="text", label="ALERT", x=t0, y=1.15 * 10**1,
                        ha="left", size=12, color="#DE7F12")
             + geom_hline(yintercept=10**2, color="#B52914", size=3)
             + annotate(geom="text", label="CRITICAL", x=t0, y=1.15 * 10**2,
                        ha="left", size=12, color="#B52914"))
    return g
def test_empty_breaks():
    """Every formatter factory maps an empty input to an empty label list."""
    factories = (custom_format, comma_format, currency_format,
                 percent_format, scientific_format, date_format,
                 mpl_format, log_format, timedelta_format)
    empty = []
    for make_formatter in factories:
        assert make_formatter()(empty) == []
def plot_portfolio_vs_benchmark(cumulative_returns, benchmark_cum_returns):
    """Save a CSV and a PNG line chart comparing cumulative portfolio
    returns against the benchmark.

    Note: adds a 'key' column to the caller's *cumulative_returns* frame in
    place (the benchmark frame is rebound to a renamed copy).
    """
    benchmark_cum_returns = benchmark_cum_returns.rename(
        columns={"benchmark": "returns"})
    benchmark_cum_returns['key'] = "benchmark"
    cumulative_returns['key'] = "portfolio"
    # DataFrame.append() was removed in pandas 2.0; pd.concat is equivalent.
    # (The original also had a no-op self-assignment of the 'returns'
    # column, dropped here.)
    df = pd.concat([cumulative_returns, benchmark_cum_returns])
    df.index.name = 'date'
    df.reset_index(level=0, inplace=True)
    # Convert fractional returns to percent for the chart/CSV.
    df['returns'] = df['returns'] * 100

    warnings.filterwarnings('ignore')
    df.to_csv(data_path + portfolio_name + 'returns.csv', header=True)
    r = (ggplot(df)
         + aes(x='date', y='returns', color='key', group='key')
         + geom_line()
         + scale_x_datetime(breaks=date_breaks('1 years'),
                            labels=date_format('%Y'))
         + theme(axis_text_x=element_text(rotation=90, hjust=1))
         + labs(title=portfolio_name + 'portfolio vs. benchmark',
                y='Returns %'))
    r.save(filename=portfolio_name + 'returns.png',
           format="png", path=results_path, width=6.4, height=4.8, dpi=125)
    warnings.filterwarnings('default')
def batch_plots(self):
    """Generate cross-program comparison plots for a batch of simulations.

    Summarizes per-program simulation columns (mean, std, 2.5%/97.5%
    quantiles) for active leaks and emissions, writes the summaries to CSV,
    and saves three figures: absolute daily emissions per program, daily
    emissions difference vs. the reference program, and the emissions ratio
    vs. the reference program.
    """
    # First, put together active leak data and output for live plotting
    # functionality (no AL plot here currently)
    dfs = self.active_leak_dfs
    for i in range(len(dfs)):
        # Summary stats computed across the simulation columns (one column
        # per simulation run).
        n_cols = dfs[i].shape[1]
        dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
        dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
        dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
        dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
        dfs[i]['program'] = self.directories[i]

    # Move reference program to the top of the list
    for i, df in enumerate(dfs):
        if df['program'].iloc[0] == self.ref_program:
            dfs.insert(0, dfs.pop(i))

    # Arrange dfs for plot 1
    dfs_p1 = dfs.copy()
    for i in range(len(dfs_p1)):
        # Reshape
        dfs_p1[i] = pd.melt(
            dfs_p1[i],
            id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])

    # Combine dataframes into single dataframe for plotting
    df_p1 = dfs_p1[0]
    for i in dfs_p1[1:]:
        df_p1 = df_p1.append(i, ignore_index=True)

    # Output Emissions df for other uses (e.g. live plot)
    df_p1.to_csv(self.output_directory + 'mean_active_leaks.csv', index=True)

    # Now repeat for emissions (which will actually be used for batch
    # plotting)
    dfs = self.emission_dfs
    for i in range(len(dfs)):
        n_cols = dfs[i].shape[1]
        dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
        dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
        dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
        dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
        dfs[i]['program'] = self.directories[i]

    # Move reference program to the top of the list
    for i, df in enumerate(dfs):
        if df['program'].iloc[0] == self.ref_program:
            dfs.insert(0, dfs.pop(i))

    # Arrange dfs for plot 1
    dfs_p1 = dfs.copy()
    for i in range(len(dfs_p1)):
        # Reshape
        dfs_p1[i] = pd.melt(
            dfs_p1[i],
            id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])

    # Combine dataframes into single dataframe for plotting
    df_p1 = dfs_p1[0]
    for i in dfs_p1[1:]:
        df_p1 = df_p1.append(i, ignore_index=True)

    # Output Emissions df for other uses (e.g. live plot)
    df_p1.to_csv(self.output_directory + 'mean_emissions.csv', index=True)

    # Make plots from list of dataframes - one entry per dataframe
    theme_set(theme_linedraw())
    plot1 = (
        ggplot(None) + aes('datetime', 'value', group='program') +
        geom_ribbon(df_p1, aes(ymin='low', ymax='high', fill='program'),
                    alpha=0.2) +
        geom_line(df_p1, aes('datetime', 'mean', colour='program'), size=1) +
        ylab('Daily emissions (kg/site)') + xlab('') +
        scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        scale_x_datetime(labels=date_format('%Y')) +
        scale_y_continuous(trans='log10') +
        ggtitle('To reduce uncertainty, use more simulations.') +
        labs(color='Program', fill='Program') +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot1.save(self.output_directory + 'program_comparison.png',
               width=7, height=3, dpi=900)

    # Build relative mitigation plots
    dfs_p2 = dfs.copy()
    for i in dfs_p2[1:]:
        i['mean_dif'] = 0
        i['std_dif'] = 0
        i['mean_ratio'] = 0
        i['std_ratio'] = 0
        for j in range(len(i)):
            # Difference/ratio of each alternative program vs. the reference
            # program (dfs_p2[0]); std of the difference is combined in
            # quadrature, std of the ratio combines relative errors in
            # quadrature.
            ref_mean = dfs_p2[0].loc[dfs_p2[0].index[j], 'mean']
            ref_std = dfs_p2[0].loc[dfs_p2[0].index[j], 'std']
            alt_mean = i.loc[i.index[j], 'mean']
            alt_std = i.loc[i.index[j], 'std']

            i.loc[i.index[j], 'mean_dif'] = alt_mean - ref_mean
            i.loc[i.index[j], 'std_dif'] = math.sqrt(
                math.pow(alt_std, 2) + math.pow(ref_std, 2))
            i.loc[i.index[j], 'mean_ratio'] = alt_mean / ref_mean
            i.loc[i.index[j], 'std_ratio'] = math.sqrt(
                math.pow((alt_std / alt_mean), 2) +
                math.pow((ref_std / ref_mean), 2))

    # Build plotting dataframe
    df_p2 = self.dates_trunc.copy().to_frame()
    df_p2['program'] = dfs_p2[1]['program']
    df_p2['mean_dif'] = dfs_p2[1]['mean_dif']
    df_p2['std_dif'] = dfs_p2[1]['std_dif']
    df_p2['mean_ratio'] = dfs_p2[1]['mean_ratio']
    df_p2['std_ratio'] = dfs_p2[1]['std_ratio']

    # 2-sigma bands for the difference and ratio plots.
    df_p2['low_dif'] = dfs_p2[1]['mean_dif'] - 2 * dfs_p2[1]['std_dif']
    df_p2['high_dif'] = dfs_p2[1]['mean_dif'] + 2 * dfs_p2[1]['std_dif']
    df_p2['low_ratio'] = dfs_p2[1]['mean_ratio'] / (
        dfs_p2[1]['mean_ratio'] + 2 * dfs_p2[1]['std_ratio'])
    df_p2['high_ratio'] = dfs_p2[1][
        'mean_ratio'] + 2 * dfs_p2[1]['std_ratio']

    pd.options.mode.chained_assignment = None
    for i in dfs_p2[2:]:
        i['low_dif'] = i['mean_dif'] - 2 * i['std_dif']
        i['high_dif'] = i['mean_dif'] + 2 * i['std_dif']
        i['low_ratio'] = i['mean_ratio'] / (i['mean_ratio'] +
                                            2 * i['std_ratio'])
        i['high_ratio'] = i['mean_ratio'] + 2 * i['std_ratio']
        short_df = i[[
            'program', 'mean_dif', 'std_dif', 'low_dif', 'high_dif',
            'mean_ratio', 'std_ratio', 'low_ratio', 'high_ratio'
        ]]
        short_df['datetime'] = np.array(self.dates_trunc)
        df_p2 = df_p2.append(short_df, ignore_index=True)

    # Make plot 2
    plot2 = (
        ggplot(None) + aes('datetime', 'mean_dif', group='program') +
        geom_ribbon(df_p2,
                    aes(ymin='low_dif', ymax='high_dif', fill='program'),
                    alpha=0.2) +
        geom_line(
            df_p2, aes('datetime', 'mean_dif', colour='program'), size=1) +
        ylab('Daily emissions difference (kg/site)') + xlab('') +
        scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        scale_x_datetime(labels=date_format('%Y')) +
        ggtitle(
            'Daily differences may be uncertain for small sample sizes') +
        # scale_y_continuous(trans='log10') +
        labs(color='Program', fill='Program') +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot2.save(self.output_directory + 'relative_mitigation.png',
               width=7, height=3, dpi=900)

    # Make plot 3
    plot3 = (
        ggplot(None) + aes('datetime', 'mean_ratio', group='program') +
        geom_ribbon(
            df_p2,
            aes(ymin='low_ratio', ymax='high_ratio', fill='program'),
            alpha=0.2) +
        geom_hline(yintercept=1, size=0.5, colour='blue') +
        geom_line(
            df_p2, aes('datetime', 'mean_ratio', colour='program'), size=1) +
        ylab('Emissions ratio') + xlab('') +
        scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        scale_x_datetime(labels=date_format('%Y')) +
        ggtitle(
            'Blue line represents equivalence. \nIf uncertainty is high, use more simulations and/or sites. \nLook also at ratio of mean daily emissions over entire timeseries.'
        ) +
        labs(color='Program', fill='Program') +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot3.save(self.output_directory + 'relative_mitigation2.png',
               width=7, height=3, dpi=900)
    return
def make_plots(leak_df, time_df, site_df, sim_n, spin_up, output_directory):
    """
    This function makes a set of standard plots to output at end of
    simulation: daily emissions and active-leak timeseries, the site-level
    cumulative emissions distribution, and three leak-level distribution
    plots. All figures are saved as PNGs under *output_directory*.
    """
    # Temporarily mute warnings
    # NOTE(review): warnings are muted here but never restored inside this
    # function — confirm whether the caller resets the filter.
    warnings.filterwarnings('ignore')
    theme_set(theme_linedraw())

    # Chop off spin-up year (only for plots, still exists in raw output)
    time_df_adj = time_df.iloc[spin_up:, ]

    # Timeseries plots
    plot_time_1 = (
        ggplot(time_df_adj, aes('datetime', 'daily_emissions_kg')) +
        geom_line(size=2) +
        ggtitle('Daily emissions from all sites (kg)') +
        ylab('') + xlab('') +
        scale_x_datetime(labels=date_format('%Y')) +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot_time_1.save(output_directory + '/plot_time_emissions_' + sim_n +
                     '.png', width=10, height=3, dpi=300)

    plot_time_2 = (
        ggplot(time_df_adj, aes('datetime', 'active_leaks')) +
        geom_line(size=2) +
        ggtitle('Number of active leaks at all sites') +
        ylab('') + xlab('') +
        scale_x_datetime(labels=date_format('%Y')) +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot_time_2.save(output_directory + '/plot_time_active_' + sim_n +
                     '.png', width=10, height=3, dpi=300)

    # Site-level plots
    plot_site_1 = (
        ggplot(site_df, aes('cum_frac_sites', 'cum_frac_emissions')) +
        geom_line(size=2) +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)) +
        xlab('Cumulative fraction of sites') +
        ylab('Cumulative fraction of emissions') +
        ggtitle('Empirical cumulative distribution of site-level emissions'))

    plot_site_1.save(output_directory + '/site_cum_dist_' + sim_n + '.png',
                     width=5, height=4, dpi=300)

    # Leak plots
    plot_leak_1 = (
        ggplot(leak_df, aes('days_active')) +
        geom_histogram(colour='gray') +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)) +
        ggtitle('Distribution of leak duration') +
        xlab('Number of days the leak was active') + ylab('Count'))

    plot_leak_1.save(output_directory + '/leak_active_hist' + sim_n + '.png',
                     width=5, height=4, dpi=300)

    plot_leak_2 = (
        ggplot(leak_df,
               aes('cum_frac_leaks', 'cum_frac_rate', colour='status')) +
        geom_line(size=2) +
        scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)) +
        xlab('Cumulative fraction of leak sources') +
        ylab('Cumulative leak rate fraction') +
        ggtitle('Fractional cumulative distribution'))

    plot_leak_2.save(output_directory + '/leak_cum_dist1_' + sim_n + '.png',
                     width=4, height=4, dpi=300)

    plot_leak_3 = (
        ggplot(leak_df, aes('cum_frac_leaks', 'cum_rate', colour='status')) +
        geom_line(size=2) +
        scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)) +
        scale_y_continuous(trans='log10') +
        xlab('Cumulative fraction of leak sources') +
        ylab('Cumulative emissions (kg/day)') +
        ggtitle('Absolute cumulative distribution'))

    plot_leak_3.save(output_directory + '/leak_cum_dist2_' + sim_n + '.png',
                     width=4, height=4, dpi=300)
    return
def batch_plots(self):
    """Generate cross-program comparison plots for a batch of simulations.

    Summarizes per-program simulation columns (mean, std, 2.5%/97.5%
    quantiles) for active leaks, emissions and costs; writes the summaries
    to CSV; and saves figures comparing absolute emissions, emissions
    difference and ratio vs. the reference program, rolling cost estimates,
    and a per-program cost breakdown by method.
    """
    # First, put together active leak data and output for live plotting
    # functionality (no AL plot here currently)
    dfs = self.active_leak_dfs
    for i in range(len(dfs)):
        # Summary stats computed across the simulation columns (one column
        # per simulation run).
        n_cols = dfs[i].shape[1]
        dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
        dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
        dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
        dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
        dfs[i]['program'] = self.directories[i]

    # Move reference program to the top of the list
    for i, df in enumerate(dfs):
        if df['program'].iloc[0] == self.ref_program:
            dfs.insert(0, dfs.pop(i))

    # Arrange dfs for plot 1
    dfs_p1 = dfs.copy()
    for i in range(len(dfs_p1)):
        # Reshape
        dfs_p1[i] = pd.melt(
            dfs_p1[i],
            id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])

    # Combine dataframes into single dataframe for plotting
    df_p1 = dfs_p1[0]
    for i in dfs_p1[1:]:
        df_p1 = df_p1.append(i, ignore_index=True)

    # Output Emissions df for other uses (e.g. live plot)
    df_p1.to_csv(self.output_directory + 'mean_active_leaks.csv', index=True)

    # Now repeat for emissions (which will actually be used for batch
    # plotting)
    dfs = self.emission_dfs
    for i in range(len(dfs)):
        n_cols = dfs[i].shape[1]
        dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
        dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
        dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
        dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
        dfs[i]['program'] = self.directories[i]

    # Move reference program to the top of the list
    for i, df in enumerate(dfs):
        if df['program'].iloc[0] == self.ref_program:
            dfs.insert(0, dfs.pop(i))

    # Arrange dfs for plot 1
    dfs_p1 = dfs.copy()
    for i in range(len(dfs_p1)):
        # Reshape
        dfs_p1[i] = pd.melt(
            dfs_p1[i],
            id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])

    # Combine dataframes into single dataframe for plotting
    df_p1 = dfs_p1[0]
    for i in dfs_p1[1:]:
        df_p1 = df_p1.append(i, ignore_index=True)

    # Output Emissions df for other uses (e.g. live plot)
    df_p1.to_csv(self.output_directory + 'mean_emissions.csv', index=True)

    # Make plots from list of dataframes - one entry per dataframe
    theme_set(theme_linedraw())
    plot1 = (
        ggplot(None) + aes('datetime', 'value', group='program') +
        geom_ribbon(df_p1, aes(ymin='low', ymax='high', fill='program'),
                    alpha=0.2) +
        geom_line(df_p1, aes('datetime', 'mean', colour='program'), size=1) +
        ylab('Daily emissions (kg/site)') + xlab('') +
        scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        scale_x_datetime(labels=date_format('%Y')) +
        scale_y_continuous(trans='log10') +
        ggtitle('To reduce uncertainty, use more simulations.') +
        labs(color='Program', fill='Program') +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot1.save(self.output_directory + 'program_comparison.png',
               width=7, height=3, dpi=900)

    # Build relative mitigation plots
    dfs_p2 = dfs.copy()
    for i in dfs_p2[1:]:
        i['mean_dif'] = 0
        i['std_dif'] = 0
        i['mean_ratio'] = 0
        i['std_ratio'] = 0
        for j in range(len(i)):
            # Difference/ratio of each alternative program vs. the reference
            # program (dfs_p2[0]); std of the difference is combined in
            # quadrature, std of the ratio combines relative errors in
            # quadrature.
            ref_mean = dfs_p2[0].loc[dfs_p2[0].index[j], 'mean']
            ref_std = dfs_p2[0].loc[dfs_p2[0].index[j], 'std']
            alt_mean = i.loc[i.index[j], 'mean']
            alt_std = i.loc[i.index[j], 'std']

            i.loc[i.index[j], 'mean_dif'] = alt_mean - ref_mean
            i.loc[i.index[j], 'std_dif'] = math.sqrt(
                math.pow(alt_std, 2) + math.pow(ref_std, 2))
            i.loc[i.index[j], 'mean_ratio'] = alt_mean / ref_mean
            i.loc[i.index[j], 'std_ratio'] = math.sqrt(
                math.pow((alt_std / alt_mean), 2) +
                math.pow((ref_std / ref_mean), 2))

    # Build plotting dataframe
    df_p2 = self.dates_trunc.copy().to_frame()
    df_p2['program'] = dfs_p2[1]['program']
    df_p2['mean_dif'] = dfs_p2[1]['mean_dif']
    df_p2['std_dif'] = dfs_p2[1]['std_dif']
    df_p2['mean_ratio'] = dfs_p2[1]['mean_ratio']
    df_p2['std_ratio'] = dfs_p2[1]['std_ratio']

    # 2-sigma bands for the difference and ratio plots.
    df_p2['low_dif'] = dfs_p2[1]['mean_dif'] - 2 * dfs_p2[1]['std_dif']
    df_p2['high_dif'] = dfs_p2[1]['mean_dif'] + 2 * dfs_p2[1]['std_dif']
    df_p2['low_ratio'] = dfs_p2[1]['mean_ratio'] / (
        dfs_p2[1]['mean_ratio'] + 2 * dfs_p2[1]['std_ratio'])
    df_p2['high_ratio'] = dfs_p2[1][
        'mean_ratio'] + 2 * dfs_p2[1]['std_ratio']

    pd.options.mode.chained_assignment = None
    for i in dfs_p2[2:]:
        i['low_dif'] = i['mean_dif'] - 2 * i['std_dif']
        i['high_dif'] = i['mean_dif'] + 2 * i['std_dif']
        i['low_ratio'] = i['mean_ratio'] / (i['mean_ratio'] +
                                            2 * i['std_ratio'])
        i['high_ratio'] = i['mean_ratio'] + 2 * i['std_ratio']
        short_df = i[[
            'program', 'mean_dif', 'std_dif', 'low_dif', 'high_dif',
            'mean_ratio', 'std_ratio', 'low_ratio', 'high_ratio'
        ]]
        short_df['datetime'] = np.array(self.dates_trunc)
        df_p2 = df_p2.append(short_df, ignore_index=True)

    # Make plot 2
    plot2 = (
        ggplot(None) + aes('datetime', 'mean_dif', group='program') +
        geom_ribbon(df_p2,
                    aes(ymin='low_dif', ymax='high_dif', fill='program'),
                    alpha=0.2) +
        geom_line(
            df_p2, aes('datetime', 'mean_dif', colour='program'), size=1) +
        ylab('Daily emissions difference (kg/site)') + xlab('') +
        scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        scale_x_datetime(labels=date_format('%Y')) +
        ggtitle(
            'Daily differences may be uncertain for small sample sizes') +
        # scale_y_continuous(trans='log10') +
        labs(color='Program', fill='Program') +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot2.save(self.output_directory + 'relative_mitigation.png',
               width=7, height=3, dpi=900)

    # Make plot 3
    plot3 = (
        ggplot(None) + aes('datetime', 'mean_ratio', group='program') +
        geom_ribbon(
            df_p2,
            aes(ymin='low_ratio', ymax='high_ratio', fill='program'),
            alpha=0.2) +
        geom_hline(yintercept=1, size=0.5, colour='blue') +
        geom_line(
            df_p2, aes('datetime', 'mean_ratio', colour='program'), size=1) +
        ylab('Emissions ratio') + xlab('') +
        scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        scale_x_datetime(labels=date_format('%Y')) +
        ggtitle(
            'Blue line represents equivalence. \nIf uncertainty is high, use more simulations and/or sites. \nLook also at ratio of mean daily emissions over entire timeseries.'
        ) +
        labs(color='Program', fill='Program') +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot3.save(self.output_directory + 'relative_mitigation2.png',
               width=7, height=3, dpi=900)

    ##################################
    ### Figure to compare costs ####
    dfs = self.cost_dfs
    for i in range(len(dfs)):
        n_cols = dfs[i].shape[1]
        dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
        dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
        dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
        dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
        dfs[i]['program'] = self.directories[i]

    # Move reference program to the top of the list
    for i, df in enumerate(dfs):
        if df['program'].iloc[0] == self.ref_program:
            dfs.insert(0, dfs.pop(i))

    # Arrange dfs for plot 1
    dfs_p1 = dfs.copy()
    for i in range(len(dfs_p1)):
        # Reshape
        dfs_p1[i] = pd.melt(
            dfs_p1[i],
            id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])

    # Combine dataframes into single dataframe for plotting
    df_p1 = dfs_p1[0]
    for i in dfs_p1[1:]:
        df_p1 = df_p1.append(i, ignore_index=True)

    # Output Emissions df for other uses (e.g. live plot)
    df_p1.to_csv(self.output_directory + 'rolling_cost_estimates.csv',
                 index=True)

    # Make plots from list of dataframes - one entry per dataframe
    theme_set(theme_linedraw())
    plot1 = (
        ggplot(None) + aes('datetime', 'value', group='program') +
        geom_ribbon(df_p1, aes(ymin='low', ymax='high', fill='program'),
                    alpha=0.2) +
        geom_line(df_p1, aes('datetime', 'mean', colour='program'), size=1) +
        ylab('Estimated cost per facility') + xlab('') +
        scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        scale_x_datetime(labels=date_format('%Y')) +
        #scale_y_continuous(trans='log10') +
        labs(color='Program', fill='Program') +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot1.save(self.output_directory + 'cost_estimate_temporal.png',
               width=7, height=3, dpi=900)

    ########################################
    ### Cost breakdown by program and method
    # Discover the per-method cost columns from the first timeseries output
    # of each program directory (all columns ending in 'cost', minus the
    # total).
    method_lists = []
    for i in range(len(self.directories)):
        df = pd.read_csv(self.output_directory + self.directories[i] +
                         "/timeseries_output_0.csv")
        df = df.filter(regex='cost$', axis=1)
        df = df.drop(columns=["total_daily_cost"])
        method_lists.append(list(df))

    # Annualized per-site cost of each method, per program and simulation.
    costs = [[] for i in range(len(self.all_data))]
    for i in range(len(self.all_data)):
        for j in range(len(self.all_data[i])):
            simcosts = []
            for k in range(len(method_lists[i])):
                timesteps = len(self.all_data[i][j][method_lists[i][k]])
                simcosts.append(
                    (sum(self.all_data[i][j][method_lists[i][k]]) /
                     timesteps / self.n_sites) * 365)
            costs[i].append(simcosts)

    rows_list = []
    for i in range(len(costs)):
        df_temp = pd.DataFrame(costs[i])
        for j in range(len(df_temp.columns)):
            # NOTE(review): 'dict' shadows the builtin here.
            dict = {}
            dict.update({'Program': self.directories[i]})
            dict.update({'Mean Cost': round(df_temp.iloc[:, j].mean())})
            dict.update({'St. Dev.': df_temp.iloc[:, j].std()})
            dict.update(
                {'Method': method_lists[i][j].replace('_cost', '')})
            rows_list.append(dict)
    df = pd.DataFrame(rows_list)

    plot = (ggplot(
        df, aes(x='Program', y='Mean Cost', fill='Method',
                label='Mean Cost')) +
            geom_bar(stat="identity") +
            ylab('Cost per Site per Year') + xlab('Program') +
            scale_fill_hue(h=0.15, l=0.25, s=0.9) +
            geom_text(size=15, position=position_stack(vjust=0.5)) +
            theme(panel_border=element_rect(
                colour="black", fill=None, size=2),
                  panel_grid_minor_x=element_blank(),
                  panel_grid_major_x=element_blank(),
                  panel_grid_minor_y=element_line(
                      colour='black', linewidth=0.5, alpha=0.3),
                  panel_grid_major_y=element_line(
                      colour='black', linewidth=1, alpha=0.5)))

    plot.save(self.output_directory + 'cost_comparison.png',
              width=7, height=3, dpi=900)
    return
# Convert the transaction amount and road-name district code to numeric
# types, then attach district ('구') names by joining on the district code.
result_df['거래금액'] = pd.to_numeric(result_df['거래금액'])
result_df['도로명시군구코드'] = pd.to_numeric(result_df['도로명시군구코드'],
                                     downcast='integer')
result_df = pd.merge(left=result_df, right=gu_code_data,
                     left_on='도로명시군구코드',
                     right_on='코드').drop('코드', axis=1)

# Build a year-month datetime column from the year ('년') and month ('월')
# string columns ('%Y%m' concatenation).
result_df['년월'] = result_df['년'] + result_df['월']
result_df['년월'] = result_df['년월'].map(
    lambda x: datetime.datetime.strptime(x, '%Y%m'))

# Sum transaction amounts per (year-month, district) and rescale from
# ten-thousand-KRW to hundred-million-KRW units.
chart_df = result_df.groupby(['년월', '구'])['거래금액'].agg('sum')
chart_df = chart_df.reset_index()
chart_df['거래금액'] = chart_df['거래금액'] * 0.0001
# Excluded because June '19 actual transactions are only partially
# reflected yet
chart_df = chart_df.query('년월 != "2019-07-01"')

#%%
# Draw the chart
(ggplot(data=chart_df, mapping=aes(x='년월', y='거래금액', color='구'))
 + geom_line()
 + scale_x_datetime(breaks=date_breaks('6 months'),
                    labels=date_format('%Y%m'))
 + scale_y_continuous(breaks=list(range(0, 20000, 2500)),
                      labels=comma_format())
 + labs(x='기간', y='거래금액 (단위:억원)', color='')
 + theme(text=element_text(family='Malgun Gothic'))
 )

#%%
# Pivot and save to a file
trade_vol_df = chart_df.pivot(index='년월', columns='구', values='거래금액')
trade_vol_df.to_csv(file_path + 'apt_trade_vol_data.csv', encoding='euc-kr')
# COMMAND ---------- from plotnine import * from plotnine.data import meat from mizani.breaks import date_breaks from mizani.formatters import date_format spkDF = spark.sql("SELECT DAYTIME,EVENT_ID, LUBE_OIL_PRESS FROM rc_data_05 WHERE RUL BETWEEN 1 AND 1440 and MM in (10,20,30,40,50) " ) spkDFPD = spkDF.toPandas() #series = pdAll[['DAYTIME','LUBE_OIL_PRESS']] pn = ggplot(spkDFPD, aes('DAYTIME','LUBE_OIL_PRESS')) + \ geom_line(color='blue') + \ scale_x_date(breaks=date_breaks('1 years'), labels=date_format('%b %Y')) + \ scale_y_continuous() + theme_bw() + theme(figure_size=(12, 8)) # COMMAND ---------- display(pn.draw()) # COMMAND ---------- ### This step is for cleaning data using simple median value of each colum to the missing or unknown value of sensors. from pyspark.sql.functions import when def replaceByMedian(pySparkDF, columnList): for colName in columnList: med = pySparkDF.approxQuantile(colName, [0.5], 0.25) pySparkDF = pySparkDF.withColumn(colName, when(pySparkDF[colName].isNotNull(),pySparkDF[colName]).otherwise(med[0]))