Example #1
def worldbank_plot(
        df: pd.DataFrame,
        title: str,
        dates_are_yearly: bool,
        figure_size=(12, 6),
        add_points=False,
        **plot_kwargs,
) -> p9.ggplot:
    """
    Carefully written to support all worldbank plots, this method is the one place where the app needs themes, colour maps
    and various plot related settings. For sparse datasets it used geom_point() in addition to geom_line() in case the data
    is so sparse that lines cannot be drawn. Returns a ggplot instance or raises an exception if the dataframe is empty.
    """
    if df is None or df.empty:
        print(f"No usable data/plot for {title}")
        raise Http404(f"No data for {title}")

    pct_na = (df["metric"].isnull().sum() / len(df)) * 100.0
    assert pct_na >= 0.0 and pct_na <= 100.0

    plot = (p9.ggplot(df, p9.aes("date", "metric", **plot_kwargs)) +
            p9.geom_path(size=1.2) +
            p9.scale_y_continuous(labels=label_shorten))
    if dates_are_yearly:
        plot += p9.scale_x_datetime(labels=date_format(
            "%Y"))  # yearly data? if so only print the year on the x-axis
    # if pct_na is too high, geom_path() may be unable to draw a line (each value is surrounded by nan preventing a path)
    # so we use geom_point() to highlight the sparse nature of the data
    if pct_na >= 30.0 or add_points or df["metric"].count() <= 3:
        plot += p9.geom_point(size=3.0)
    return user_theme(plot, y_axis_label="Value", figure_size=figure_size)
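A minimal, hypothetical call of worldbank_plot (made-up values; it relies on the module's imports and helpers used above, such as p9, label_shorten and user_theme):

import pandas as pd

sparse_df = pd.DataFrame({
    "date": pd.to_datetime(["2000-01-01", "2005-01-01", "2010-01-01"]),
    "metric": [1.2e9, None, 1.9e9],  # the NaN makes the series sparse, triggering geom_point()
})
plot = worldbank_plot(sparse_df, "GDP (current US$)", dates_are_yearly=True)
plot.save("gdp.png")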
Example #2
def test_date_format():
    x = pd.date_range('1/1/2010', periods=4, freq='4AS')
    result = date_format('%Y')(x)
    assert result == ['2010', '2014', '2018', '2022']

    x = [datetime(year=2005 + i, month=i, day=i) for i in range(1, 5)]
    result = date_format('%Y:%m:%d')(x)
    assert result == \
        ['2006:01:01', '2007:02:02', '2008:03:03', '2009:04:04']

    # Different timezones
    pct = pytz.timezone('US/Pacific')
    ug = pytz.timezone('Africa/Kampala')
    x = [datetime(2010, 1, 1, tzinfo=ug), datetime(2010, 1, 1, tzinfo=pct)]
    with pytest.warns(UserWarning):
        date_format()(x)
Example #3
    def _chart(self, df, checklist, selected_words):

        if df is None:
            return self.getErrorPlot(
                self.ERROR_MSG.format(word=selected_words))

        p = (  #ggplot(df , aes(x=TR.COLS.DATE  , y=TrendRank.Consts.VAL_NAME))
            ggplot(df)
            #+ geom_tile(aes(x=TR.COLS.DATE , y=TrendRank.Consts.VAL_NAME , fill=TrendRank.Consts.VAR_NAME))
            + geom_tile(
                aes(x=TR.COLS.DATE,
                    y=TR.COLS.CHNL,
                    fill=TrendRank.Consts.VAR_NAME)) +
            scale_x_datetime(labels=date_format('%m/%y'))
            #+ geom_point(aes(fill=TrendRank.Consts.VAR_NAME, alpha=TrendRank.Consts.VAL_NAME) , stroke=0)
            #+ geom_smooth(aes(group=TrendRank.Consts.VAR_NAME , color=TrendRank.Consts.VAR_NAME),se=False)
            #+ geom_line(aes(group=TrendRank.Consts.VAR_NAME , color=TrendRank.Consts.VAR_NAME))
            #+ scale_y_discrete(limits = list(reversed(np.arange(len(selected_words)))))
            + ggtitle("Top Term over Time Across Categories") + THEME.mt +
            theme(figure_size=(20, 5),
                  panel_grid_major=element_blank(),
                  panel_grid_minor=element_blank()))

        # p = ggplot(df , aes(x=TR.COLS.DATE  , y=TrendRank.Consts.VAR_NAME))\
        #     + geom_tile(aes(fill=TrendRank.Consts.VAL_NAME))\
        #     + facet_grid(f"~{TR.COLS.CHNL}")\
        #     + THEME.mt \
        #     + theme(figure_size=(20,5) , panel_grid_major=element_blank() , panel_grid_minor=element_blank())

        return p
Example #4
def one_day_graph(collect_date='20191015', gateway_id='ep18270334'):
    db = 'aihems_api_db'

    # db = 'aihems_service_db'
    conn = pymysql.connect(
        host='aihems-service-db.cnz3sewvscki.ap-northeast-2.rds.amazonaws.com',
        port=3306,
        user='******',
        passwd='#cslee1234',
        db=db,
        charset='utf8')

    sql = f"""
    SELECT
        COLLECT_DATE
        , COLLECT_TIME
        , ONOFF
        , case when POWER > 20 then 1 else 0 end POWER
--        , POWER
        , ENERGY_DIFF
    FROM AH_USE_LOG_BYMINUTE
    WHERE 1=1 
    AND GATEWAY_ID = '{gateway_id}'
    AND COLLECT_DATE = '{collect_date}'
    """

    df = pd.read_sql(sql, con=conn)
    df['date'] = df.COLLECT_DATE + ' ' + df.COLLECT_TIME
    #     print(sql)
    df.date = pd.to_datetime(df.date)
    print(collect_date)
    return (ggplot(df, aes(x='date', y='POWER'))
            + geom_line()
            + scale_x_datetime(breaks=date_breaks('2 hours'), labels=date_format('%H')))
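Hypothetical usage of one_day_graph (it needs live access to the database configured above); the returned plotnine object can be saved like any other ggplot:

p = one_day_graph(collect_date='20191015', gateway_id='ep18270334')
p.save('one_day_power.png', width=12, height=4, dpi=150)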
Example #5
    def __init__(self, **kwargs):
        # Permit the use of the general parameters for
        # specifying the format strings
        with suppress(KeyError):
            breaks = kwargs['breaks']
            if isinstance(breaks, six.string_types):
                kwargs['breaks'] = date_breaks(breaks)

        with suppress(KeyError):
            minor_breaks = kwargs['minor_breaks']
            if isinstance(minor_breaks, six.string_types):
                kwargs['minor_breaks'] = date_breaks(minor_breaks)

        # Using the more specific parameters take precedence
        with suppress(KeyError):
            breaks_fmt = kwargs.pop('date_breaks')
            kwargs['breaks'] = date_breaks(breaks_fmt)

        with suppress(KeyError):
            labels_fmt = kwargs.pop('date_labels')
            kwargs['labels'] = date_format(labels_fmt)

        with suppress(KeyError):
            minor_breaks_fmt = kwargs.pop('date_minor_breaks')
            kwargs['minor_breaks'] = date_breaks(minor_breaks_fmt)

        scale_continuous.__init__(self, **kwargs)
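A sketch of the shortcut this constructor provides, assuming it is the base of plotnine's datetime scales (as the date_breaks/date_labels parameters suggest): format strings are converted to the corresponding mizani callables, so the two scales below are equivalent.

from plotnine import scale_x_datetime
from mizani.breaks import date_breaks
from mizani.formatters import date_format

s1 = scale_x_datetime(date_breaks='1 year', date_labels='%Y')
s2 = scale_x_datetime(breaks=date_breaks('1 year'), labels=date_format('%Y'))
# s1 and s2 produce the same breaks and labels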
Example #6
def plot_drawdowns(cumulative_returns, benchmark_cum_returns):
    """Any time the cumulative returns dips below the current cumulative
    maximum returns, it's a drawdown. Drawdowns are measured as a percentage of
    that maximum cumulative return, in effect, measured from peak equity."""
    benchmark_drawdown = get_drawdown(benchmark_cum_returns)
    benchmark_drawdown = benchmark_drawdown.to_frame()
    benchmark_drawdown = benchmark_drawdown.rename(columns={"benchmark": "drawdown"})
    benchmark_drawdown['key'] = "benchmark"
    benchmark_drawdown.index.name = 'date'
    benchmark_drawdown.reset_index(level=0, inplace=True)
    portfolio_drawdown = get_drawdown(cumulative_returns)
    portfolio_drawdown = portfolio_drawdown.to_frame()
    portfolio_drawdown['key'] = "portfolio"
    portfolio_drawdown = portfolio_drawdown.rename(columns={"returns": "drawdown"})
    portfolio_drawdown.index.name = 'date'
    portfolio_drawdown.reset_index(level=0, inplace=True)
    mask = benchmark_drawdown.date.isin(portfolio_drawdown.date)
    benchmark_drawdown = benchmark_drawdown[mask]
    df = portfolio_drawdown.append(benchmark_drawdown)
    df.to_csv(data_path+portfolio_name
                        +'drawdowns.csv', header = True)
    warnings.filterwarnings('ignore')
    d = (ggplot(df)
         + aes(x = 'date', y = 'drawdown', color='key', group='key')
         + geom_line()
         + scale_x_datetime(breaks=date_breaks('1 years'), labels=date_format('%Y'))
         + theme(axis_text_x=element_text(rotation=90, hjust=1))
         + labs(title=portfolio_name+'portfolio vs. benchmark',
                y = 'Drawdown % (change peak to trough)')
         )
    d.save(filename=portfolio_name+'drawdowns.png', \
        format="png", path=results_path, width = 6.4, height = 4.8, dpi=125)
    warnings.filterwarnings('default')
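get_drawdown() is not shown in this snippet; a minimal sketch consistent with the docstring above (assuming the cumulative returns form an equity-curve style series, with drawdown expressed as percent below the running peak):

def get_drawdown(cumulative):
    running_max = cumulative.cummax()
    return (cumulative - running_max) / running_max * 100.0  # percent decline from peak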
Example #7
def test_date_format():
    x = pd.date_range('1/1/2010', periods=4, freq='4AS')
    result = date_format('%Y')(x)
    assert result == ['2010', '2014', '2018', '2022']

    x = [datetime(year=2005+i, month=i, day=i) for i in range(1, 5)]
    result = date_format('%Y:%m:%d')(x)
    assert result == \
        ['2006:01:01', '2007:02:02', '2008:03:03', '2009:04:04']

    # Different timezones
    pct = pytz.timezone('US/Pacific')
    ug = pytz.timezone('Africa/Kampala')
    x = [datetime(2010, 1, 1, tzinfo=ug),
         datetime(2010, 1, 1, tzinfo=pct)]
    with pytest.warns(UserWarning):
        date_format()(x)
Example #8
 def create_plot(self, columns):
     for col in columns:
         if col not in self.data.columns:
             raise ValueError('No column "%s" in the data' % col)
     try:
         from plotnine import ggplot, theme_bw, aes, geom_line, expand_limits, scale_x_datetime, ylab, facet_wrap, theme
         from mizani.formatters import date_format
     except ImportError:
         msg = """Package 'plotnine' is required for the plot functionnality.
         Try installing it with 'pip install plotnine'.
         """
         raise RatatouilleDependencyError(msg)
     data = self.data.copy()
     if len(columns) > 0:
         data = data[['timestamp'] + columns]
     else:
         if 'hostname' in data:
             data.drop('hostname', axis=1, inplace=True)
     data['time_diff'] = data['timestamp'][1:].reset_index(
         drop=True) - data['timestamp'][:-1].reset_index(drop=True)
     time_step = data['time_diff'].median()
     breakpoints = list(data[data['time_diff'] > time_step * 10].timestamp)
     breakpoints = [
         data['timestamp'].min(), *breakpoints, data['timestamp'].max()
     ]
     data = data.drop('time_diff', axis=1).melt('timestamp')
     import pandas
     if len(columns) > 0:
         data['variable'] = pandas.Categorical(data['variable'],
                                               categories=columns)
     plot = ggplot() + theme_bw()
     for min_t, max_t in zip(breakpoints[:-1], breakpoints[1:]):
         tmp = data[(data['timestamp'] > min_t)
                    & (data['timestamp'] < max_t)]
         plot += geom_line(tmp,
                           aes(x='timestamp', y='value', color='variable'),
                           show_legend=False)
     plot += facet_wrap(['variable'], scales='free')
     timedelta = self.data.timestamp.max() - self.data.timestamp.min()
     if timedelta.days > 2:
         plot += scale_x_datetime(labels=date_format('%Y/%m/%d'))
     else:
         plot += scale_x_datetime(labels=date_format('%H:%M'))
     plot += ylab('Value')
     return plot
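The gap handling above is worth a note: instead of a single geom_line over the whole frame, create_plot() adds one geom_line per contiguous segment so lines are never drawn across large holes in the monitoring data. A standalone sketch of the same breakpoint computation on toy timestamps (not part of the ratatouille API):

import pandas as pd

ts = pd.Series(pd.to_datetime(['2021-01-01 00:00', '2021-01-01 00:01',
                               '2021-01-01 02:00', '2021-01-01 02:01']))
time_diff = ts[1:].reset_index(drop=True) - ts[:-1].reset_index(drop=True)
time_step = time_diff.median()
breakpoints = list(ts[:-1][time_diff > time_step * 10])
breakpoints = [ts.min(), *breakpoints, ts.max()]
# breakpoints -> [00:00, 00:01, 02:01]; create_plot() then adds one geom_line per
# (min_t, max_t) pair so no line crosses the two-hour hole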
Example #9
    def create_plot( self, data, meta ):
        """Create plot w/ custom stylings."""
        print(meta)
        title_by_signal = self.titles_by_signal.get( meta['signal'], self.unknown_title )
        title_by_signal += "\n Data Source: %s" % meta['source']

        ###

        x = np.array( data['values'] )
        x = x.astype(float)
        t = np.array( data['time'] )

        d = { 'values': x, 'time': t }
        df = pd.DataFrame( data=d )
        df['time'] = pd.to_datetime( df['time'] )

        ###

        t0 = df['time'][0]

        plot_theme = theme(
            figure_size=( self.height, self.width ),
            panel_background=element_rect( fill="black" ),
            plot_background=element_rect( fill="gray" ),
            panel_grid_major_y=element_blank(),
            panel_grid_major_x=element_blank(),
            panel_grid_minor_y=element_blank(),
            panel_grid_minor_x=element_blank(),
            title=element_text( color="black",size=20 ),
            axis_text_x=element_text( color="black",size=10 ),
            axis_text_y=element_text( color="black",size=15 ),
            axis_title_x=element_text( color="black",size=12 ),
            axis_title_y=element_text( color="black",size=15 ),
        )

        g = (ggplot()
            + geom_line( df, aes( 'time','values' ), color="#76D115", size=2 )
            + scale_x_datetime( labels = date_format( "%b %d %H:%M" ) )
            + ggtitle( title_by_signal ) 
            + xlab( "Universal Time" )  
            + ylab( "Proton Flux Unit : Particles $cm^{-2}s^{-1}sr^{-1}$" )
            + plot_theme 
        )

        if meta['signal'] == 'P10':
            g = ( g + ylim ( 10**-1, 10**4 )
            + scale_y_log10( breaks=[10**-1, 10**0, 10**1, 10**2, 10**3, 10**4] ) 
            + geom_hline( yintercept=10**0, color="#E6C329", size=3 )
            + annotate( geom="text", label="WARNING", x=t0 , y=1.15*10**0, ha="left", size=12, color = "#E6C329" )
            + geom_hline( yintercept=10**1, color="#DE7F12", size=3 )
            + annotate( geom="text", label="ALERT", x=t0 , y=1.15*10**1, ha="left", size=12, color = "#DE7F12" )
            + geom_hline( yintercept=10**2, color="#B52914", size=3 )
            + annotate( geom="text", label="CRITICAL", x=t0 , y=1.15*10**2, ha="left", size=12, color = "#B52914" )
        )

        return g
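The expected shapes of data and meta are only implicit above; toy inputs that satisfy the field accesses in create_plot (field names inferred from the snippet, values made up):

data = {'time': ['2023-01-01T00:00', '2023-01-01T00:05', '2023-01-01T00:10'],
        'values': ['0.8', '1.3', '2.7']}          # strings are fine: they are cast to float
meta = {'signal': 'P10', 'source': 'GOES'}        # 'P10' switches on the threshold annotations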
Example #10
def test_empty_breaks():
    x = []
    assert custom_format()(x) == []
    assert comma_format()(x) == []
    assert currency_format()(x) == []
    assert percent_format()(x) == []
    assert scientific_format()(x) == []
    assert date_format()(x) == []
    assert mpl_format()(x) == []
    assert log_format()(x) == []
    assert timedelta_format()(x) == []
Example #11
def test_empty_breaks():
    x = []
    assert custom_format()(x) == []
    assert comma_format()(x) == []
    assert currency_format()(x) == []
    assert percent_format()(x) == []
    assert scientific_format()(x) == []
    assert date_format()(x) == []
    assert mpl_format()(x) == []
    assert log_format()(x) == []
    assert timedelta_format()(x) == []
Example #12
def plot_portfolio_vs_benchmark(cumulative_returns, benchmark_cum_returns):
    benchmark_cum_returns = benchmark_cum_returns.rename(columns={"benchmark": "returns"})
    benchmark_cum_returns['key'] = "benchmark"
    cumulative_returns['key'] = "portfolio"
    cumulative_returns["returns"] = cumulative_returns["returns"]
    df = cumulative_returns.append(benchmark_cum_returns)
    df.index.name = 'date'
    df.reset_index(level=0, inplace=True)
    df['returns'] = df['returns']*100
    warnings.filterwarnings('ignore')
    df.to_csv(data_path+portfolio_name
                        +'returns.csv', header = True)
    r = (ggplot(df)
         + aes(x = 'date', y = 'returns', color='key', group='key')
         + geom_line()
         + scale_x_datetime(breaks=date_breaks('1 years'), labels=date_format('%Y'))
         + theme(axis_text_x=element_text(rotation=90, hjust=1))
         + labs(title=portfolio_name+'portfolio vs. benchmark',
                y = 'Returns %')
         )
    r.save(filename=portfolio_name+'returns.png', \
            format="png", path=results_path, width = 6.4, height = 4.8, dpi=125)
    warnings.filterwarnings('default')
Example #13
    def batch_plots(self):

        # First, put together active leak data and output for live plotting functionality (no AL plot here currently)
        dfs = self.active_leak_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

        # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(
                dfs_p1[i],
                id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output Emissions df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'mean_active_leaks.csv',
                     index=True)

        # Now repeat for emissions (which will actually be used for batch plotting)
        dfs = self.emission_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

            # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(
                dfs_p1[i],
                id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output Emissions df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'mean_emissions.csv', index=True)

        # Make plots from list of dataframes - one entry per dataframe
        theme_set(theme_linedraw())
        plot1 = (
            ggplot(None) + aes('datetime', 'value', group='program') +
            geom_ribbon(df_p1,
                        aes(ymin='low', ymax='high', fill='program'),
                        alpha=0.2) +
            geom_line(df_p1, aes('datetime', 'mean', colour='program'),
                      size=1) + ylab('Daily emissions (kg/site)') + xlab('') +
            scale_colour_hue(h=0.15, l=0.25, s=0.9) +
            scale_x_datetime(labels=date_format('%Y')) +
            scale_y_continuous(trans='log10') +
            ggtitle('To reduce uncertainty, use more simulations.') +
            labs(color='Program', fill='Program') +
            theme(panel_border=element_rect(colour="black", fill=None, size=2),
                  panel_grid_minor_x=element_blank(),
                  panel_grid_major_x=element_blank(),
                  panel_grid_minor_y=element_line(
                      colour='black', linewidth=0.5, alpha=0.3),
                  panel_grid_major_y=element_line(
                      colour='black', linewidth=1, alpha=0.5)))
        plot1.save(self.output_directory + 'program_comparison.png',
                   width=7,
                   height=3,
                   dpi=900)

        # Build relative mitigation plots
        dfs_p2 = dfs.copy()

        for i in dfs_p2[1:]:
            i['mean_dif'] = 0
            i['std_dif'] = 0
            i['mean_ratio'] = 0
            i['std_ratio'] = 0
            for j in range(len(i)):
                ref_mean = dfs_p2[0].loc[dfs_p2[0].index[j], 'mean']
                ref_std = dfs_p2[0].loc[dfs_p2[0].index[j], 'std']
                alt_mean = i.loc[i.index[j], 'mean']
                alt_std = i.loc[i.index[j], 'std']

                i.loc[i.index[j], 'mean_dif'] = alt_mean - ref_mean
                i.loc[i.index[j], 'std_dif'] = math.sqrt(
                    math.pow(alt_std, 2) + math.pow(ref_std, 2))
                i.loc[i.index[j], 'mean_ratio'] = alt_mean / ref_mean
                i.loc[i.index[j], 'std_ratio'] = math.sqrt(
                    math.pow((alt_std / alt_mean), 2) +
                    math.pow((ref_std / ref_mean), 2))

        # Build plotting dataframe
        df_p2 = self.dates_trunc.copy().to_frame()
        df_p2['program'] = dfs_p2[1]['program']
        df_p2['mean_dif'] = dfs_p2[1]['mean_dif']
        df_p2['std_dif'] = dfs_p2[1]['std_dif']
        df_p2['mean_ratio'] = dfs_p2[1]['mean_ratio']
        df_p2['std_ratio'] = dfs_p2[1]['std_ratio']

        df_p2['low_dif'] = dfs_p2[1]['mean_dif'] - 2 * dfs_p2[1]['std_dif']
        df_p2['high_dif'] = dfs_p2[1]['mean_dif'] + 2 * dfs_p2[1]['std_dif']
        df_p2['low_ratio'] = dfs_p2[1]['mean_ratio'] / (
            dfs_p2[1]['mean_ratio'] + 2 * dfs_p2[1]['std_ratio'])
        df_p2['high_ratio'] = dfs_p2[1][
            'mean_ratio'] + 2 * dfs_p2[1]['std_ratio']

        pd.options.mode.chained_assignment = None
        for i in dfs_p2[2:]:
            i['low_dif'] = i['mean_dif'] - 2 * i['std_dif']
            i['high_dif'] = i['mean_dif'] + 2 * i['std_dif']
            i['low_ratio'] = i['mean_ratio'] / (i['mean_ratio'] +
                                                2 * i['std_ratio'])
            i['high_ratio'] = i['mean_ratio'] + 2 * i['std_ratio']
            short_df = i[[
                'program', 'mean_dif', 'std_dif', 'low_dif', 'high_dif',
                'mean_ratio', 'std_ratio', 'low_ratio', 'high_ratio'
            ]]
            short_df['datetime'] = np.array(self.dates_trunc)
            df_p2 = df_p2.append(short_df, ignore_index=True)

        # Make plot 2
        plot2 = (
            ggplot(None) + aes('datetime', 'mean_dif', group='program') +
            geom_ribbon(df_p2,
                        aes(ymin='low_dif', ymax='high_dif', fill='program'),
                        alpha=0.2) +
            geom_line(
                df_p2, aes('datetime', 'mean_dif', colour='program'), size=1) +
            ylab('Daily emissions difference (kg/site)') + xlab('') +
            scale_colour_hue(h=0.15, l=0.25, s=0.9) +
            scale_x_datetime(labels=date_format('%Y')) + ggtitle(
                'Daily differences may be uncertain for small sample sizes') +
            #        scale_y_continuous(trans='log10') +
            labs(color='Program', fill='Program') +
            theme(panel_border=element_rect(colour="black", fill=None, size=2),
                  panel_grid_minor_x=element_blank(),
                  panel_grid_major_x=element_blank(),
                  panel_grid_minor_y=element_line(
                      colour='black', linewidth=0.5, alpha=0.3),
                  panel_grid_major_y=element_line(
                      colour='black', linewidth=1, alpha=0.5)))
        plot2.save(self.output_directory + 'relative_mitigation.png',
                   width=7,
                   height=3,
                   dpi=900)

        # Make plot 3
        plot3 = (
            ggplot(None) + aes('datetime', 'mean_ratio', group='program') +
            geom_ribbon(
                df_p2,
                aes(ymin='low_ratio', ymax='high_ratio', fill='program'),
                alpha=0.2) +
            geom_hline(yintercept=1, size=0.5, colour='blue') + geom_line(
                df_p2, aes('datetime', 'mean_ratio', colour='program'),
                size=1) + ylab('Emissions ratio') + xlab('') +
            scale_colour_hue(h=0.15, l=0.25, s=0.9) +
            scale_x_datetime(labels=date_format('%Y')) + ggtitle(
                'Blue line represents equivalence. \nIf uncertainty is high, use more simulations and/or sites. \nLook also at ratio of mean daily emissions over entire timeseries.'
            ) + labs(color='Program', fill='Program') +
            theme(panel_border=element_rect(colour="black", fill=None, size=2),
                  panel_grid_minor_x=element_blank(),
                  panel_grid_major_x=element_blank(),
                  panel_grid_minor_y=element_line(
                      colour='black', linewidth=0.5, alpha=0.3),
                  panel_grid_major_y=element_line(
                      colour='black', linewidth=1, alpha=0.5)))
        plot3.save(self.output_directory + 'relative_mitigation2.png',
                   width=7,
                   height=3,
                   dpi=900)

        return
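A minimal sketch of the summary-and-melt step that batch_plots performs before building the ribbon plots, on a toy frame where each sim_* column is one simulation run (the layout is an assumption):

import numpy as np
import pandas as pd

sims = pd.DataFrame(np.random.rand(5, 3), columns=['sim_0', 'sim_1', 'sim_2'])
sims['mean'] = sims[['sim_0', 'sim_1', 'sim_2']].mean(axis=1)
sims['std'] = sims[['sim_0', 'sim_1', 'sim_2']].std(axis=1)
sims['low'] = sims[['sim_0', 'sim_1', 'sim_2']].quantile(0.025, axis=1)
sims['high'] = sims[['sim_0', 'sim_1', 'sim_2']].quantile(0.975, axis=1)
sims['datetime'] = pd.date_range('2020-01-01', periods=5)
sims['program'] = 'P_ref'
long_df = pd.melt(sims, id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])
# long_df has one row per (date, simulation), ready for geom_ribbon/geom_line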
Example #14
def make_plots(leak_df, time_df, site_df, sim_n, spin_up, output_directory):
    """
    This function makes a set of standard plots to output at end of simulation.
    """
    # Temporarily mute warnings
    warnings.filterwarnings('ignore')
    theme_set(theme_linedraw())

    # Chop off spin-up year (only for plots, still exists in raw output)
    time_df_adj = time_df.iloc[spin_up:, ]

    # Timeseries plots
    plot_time_1 = (
        ggplot(time_df_adj, aes('datetime', 'daily_emissions_kg')) +
        geom_line(size=2) + ggtitle('Daily emissions from all sites (kg)') +
        ylab('') + xlab('') + scale_x_datetime(labels=date_format('%Y')) +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot_time_1.save(output_directory + '/plot_time_emissions_' + sim_n +
                     '.png',
                     width=10,
                     height=3,
                     dpi=300)

    plot_time_2 = (
        ggplot(time_df_adj, aes('datetime', 'active_leaks')) +
        geom_line(size=2) + ggtitle('Number of active leaks at all sites') +
        ylab('') + xlab('') + scale_x_datetime(labels=date_format('%Y')) +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)))

    plot_time_2.save(output_directory + '/plot_time_active_' + sim_n + '.png',
                     width=10,
                     height=3,
                     dpi=300)

    # Site-level plots
    plot_site_1 = (
        ggplot(site_df, aes('cum_frac_sites', 'cum_frac_emissions')) +
        geom_line(size=2) +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)) +
        xlab('Cumulative fraction of sites') +
        ylab('Cumulative fraction of emissions') +
        ggtitle('Empirical cumulative distribution of site-level emissions'))

    plot_site_1.save(output_directory + '/site_cum_dist_' + sim_n + '.png',
                     width=5,
                     height=4,
                     dpi=300)

    # Leak plots
    plot_leak_1 = (
        ggplot(leak_df, aes('days_active')) + geom_histogram(colour='gray') +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)) +
        ggtitle('Distribution of leak duration') +
        xlab('Number of days the leak was active') + ylab('Count'))
    plot_leak_1.save(output_directory + '/leak_active_hist' + sim_n + '.png',
                     width=5,
                     height=4,
                     dpi=300)

    plot_leak_2 = (
        ggplot(leak_df, aes('cum_frac_leaks', 'cum_frac_rate',
                            colour='status')) + geom_line(size=2) +
        scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)) +
        xlab('Cumulative fraction of leak sources') +
        ylab('Cumulative leak rate fraction') +
        ggtitle('Fractional cumulative distribution'))

    plot_leak_2.save(output_directory + '/leak_cum_dist1_' + sim_n + '.png',
                     width=4,
                     height=4,
                     dpi=300)

    plot_leak_3 = (
        ggplot(leak_df, aes('cum_frac_leaks', 'cum_rate', colour='status')) +
        geom_line(size=2) + scale_colour_hue(h=0.15, l=0.25, s=0.9) +
        theme(panel_border=element_rect(colour="black", fill=None, size=2),
              panel_grid_minor_x=element_blank(),
              panel_grid_major_x=element_blank(),
              panel_grid_minor_y=element_line(
                  colour='black', linewidth=0.5, alpha=0.3),
              panel_grid_major_y=element_line(
                  colour='black', linewidth=1, alpha=0.5)) +
        scale_y_continuous(trans='log10') +
        xlab('Cumulative fraction of leak sources') +
        ylab('Cumulative emissions (kg/day)') +
        ggtitle('Absolute cumulative distribution'))

    plot_leak_3.save(output_directory + '/leak_cum_dist2_' + sim_n + '.png',
                     width=4,
                     height=4,
                     dpi=300)

    return
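The spin-up handling in make_plots() is a plain iloc slice: the first spin_up rows are dropped before plotting, while the raw output keeps them. A toy illustration (made-up values):

import pandas as pd

time_df = pd.DataFrame({'datetime': pd.date_range('2019-01-01', periods=730, freq='D'),
                        'daily_emissions_kg': range(730)})
spin_up = 365                      # e.g. one simulated year of daily timesteps
time_df_adj = time_df.iloc[spin_up:, ]
# time_df_adj now starts at 2020-01-01; the dropped spin-up rows remain in time_df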
Example #15
    def batch_plots(self):

        # First, put together active leak data and output for live plotting functionality (no AL plot here currently)
        dfs = self.active_leak_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

        # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(
                dfs_p1[i],
                id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output Emissions df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'mean_active_leaks.csv',
                     index=True)

        # Now repeat for emissions (which will actually be used for batch plotting)
        dfs = self.emission_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

            # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(
                dfs_p1[i],
                id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output Emissions df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'mean_emissions.csv', index=True)

        # Make plots from list of dataframes - one entry per dataframe
        theme_set(theme_linedraw())
        plot1 = (
            ggplot(None) + aes('datetime', 'value', group='program') +
            geom_ribbon(df_p1,
                        aes(ymin='low', ymax='high', fill='program'),
                        alpha=0.2) +
            geom_line(df_p1, aes('datetime', 'mean', colour='program'),
                      size=1) + ylab('Daily emissions (kg/site)') + xlab('') +
            scale_colour_hue(h=0.15, l=0.25, s=0.9) +
            scale_x_datetime(labels=date_format('%Y')) +
            scale_y_continuous(trans='log10') +
            ggtitle('To reduce uncertainty, use more simulations.') +
            labs(color='Program', fill='Program') +
            theme(panel_border=element_rect(colour="black", fill=None, size=2),
                  panel_grid_minor_x=element_blank(),
                  panel_grid_major_x=element_blank(),
                  panel_grid_minor_y=element_line(
                      colour='black', linewidth=0.5, alpha=0.3),
                  panel_grid_major_y=element_line(
                      colour='black', linewidth=1, alpha=0.5)))
        plot1.save(self.output_directory + 'program_comparison.png',
                   width=7,
                   height=3,
                   dpi=900)

        # Build relative mitigation plots
        dfs_p2 = dfs.copy()

        for i in dfs_p2[1:]:
            i['mean_dif'] = 0
            i['std_dif'] = 0
            i['mean_ratio'] = 0
            i['std_ratio'] = 0
            for j in range(len(i)):
                ref_mean = dfs_p2[0].loc[dfs_p2[0].index[j], 'mean']
                ref_std = dfs_p2[0].loc[dfs_p2[0].index[j], 'std']
                alt_mean = i.loc[i.index[j], 'mean']
                alt_std = i.loc[i.index[j], 'std']

                i.loc[i.index[j], 'mean_dif'] = alt_mean - ref_mean
                i.loc[i.index[j], 'std_dif'] = math.sqrt(
                    math.pow(alt_std, 2) + math.pow(ref_std, 2))
                i.loc[i.index[j], 'mean_ratio'] = alt_mean / ref_mean
                i.loc[i.index[j], 'std_ratio'] = math.sqrt(
                    math.pow((alt_std / alt_mean), 2) +
                    math.pow((ref_std / ref_mean), 2))

        # Build plotting dataframe
        df_p2 = self.dates_trunc.copy().to_frame()
        df_p2['program'] = dfs_p2[1]['program']
        df_p2['mean_dif'] = dfs_p2[1]['mean_dif']
        df_p2['std_dif'] = dfs_p2[1]['std_dif']
        df_p2['mean_ratio'] = dfs_p2[1]['mean_ratio']
        df_p2['std_ratio'] = dfs_p2[1]['std_ratio']

        df_p2['low_dif'] = dfs_p2[1]['mean_dif'] - 2 * dfs_p2[1]['std_dif']
        df_p2['high_dif'] = dfs_p2[1]['mean_dif'] + 2 * dfs_p2[1]['std_dif']
        df_p2['low_ratio'] = dfs_p2[1]['mean_ratio'] / (
            dfs_p2[1]['mean_ratio'] + 2 * dfs_p2[1]['std_ratio'])
        df_p2['high_ratio'] = dfs_p2[1][
            'mean_ratio'] + 2 * dfs_p2[1]['std_ratio']

        pd.options.mode.chained_assignment = None
        for i in dfs_p2[2:]:
            i['low_dif'] = i['mean_dif'] - 2 * i['std_dif']
            i['high_dif'] = i['mean_dif'] + 2 * i['std_dif']
            i['low_ratio'] = i['mean_ratio'] / (i['mean_ratio'] +
                                                2 * i['std_ratio'])
            i['high_ratio'] = i['mean_ratio'] + 2 * i['std_ratio']
            short_df = i[[
                'program', 'mean_dif', 'std_dif', 'low_dif', 'high_dif',
                'mean_ratio', 'std_ratio', 'low_ratio', 'high_ratio'
            ]]
            short_df['datetime'] = np.array(self.dates_trunc)
            df_p2 = df_p2.append(short_df, ignore_index=True)

        # Make plot 2
        plot2 = (
            ggplot(None) + aes('datetime', 'mean_dif', group='program') +
            geom_ribbon(df_p2,
                        aes(ymin='low_dif', ymax='high_dif', fill='program'),
                        alpha=0.2) +
            geom_line(
                df_p2, aes('datetime', 'mean_dif', colour='program'), size=1) +
            ylab('Daily emissions difference (kg/site)') + xlab('') +
            scale_colour_hue(h=0.15, l=0.25, s=0.9) +
            scale_x_datetime(labels=date_format('%Y')) + ggtitle(
                'Daily differences may be uncertain for small sample sizes') +
            #        scale_y_continuous(trans='log10') +
            labs(color='Program', fill='Program') +
            theme(panel_border=element_rect(colour="black", fill=None, size=2),
                  panel_grid_minor_x=element_blank(),
                  panel_grid_major_x=element_blank(),
                  panel_grid_minor_y=element_line(
                      colour='black', linewidth=0.5, alpha=0.3),
                  panel_grid_major_y=element_line(
                      colour='black', linewidth=1, alpha=0.5)))
        plot2.save(self.output_directory + 'relative_mitigation.png',
                   width=7,
                   height=3,
                   dpi=900)

        # Make plot 3
        plot3 = (
            ggplot(None) + aes('datetime', 'mean_ratio', group='program') +
            geom_ribbon(
                df_p2,
                aes(ymin='low_ratio', ymax='high_ratio', fill='program'),
                alpha=0.2) +
            geom_hline(yintercept=1, size=0.5, colour='blue') + geom_line(
                df_p2, aes('datetime', 'mean_ratio', colour='program'),
                size=1) + ylab('Emissions ratio') + xlab('') +
            scale_colour_hue(h=0.15, l=0.25, s=0.9) +
            scale_x_datetime(labels=date_format('%Y')) + ggtitle(
                'Blue line represents equivalence. \nIf uncertainty is high, use more simulations and/or sites. \nLook also at ratio of mean daily emissions over entire timeseries.'
            ) + labs(color='Program', fill='Program') +
            theme(panel_border=element_rect(colour="black", fill=None, size=2),
                  panel_grid_minor_x=element_blank(),
                  panel_grid_major_x=element_blank(),
                  panel_grid_minor_y=element_line(
                      colour='black', linewidth=0.5, alpha=0.3),
                  panel_grid_major_y=element_line(
                      colour='black', linewidth=1, alpha=0.5)))
        plot3.save(self.output_directory + 'relative_mitigation2.png',
                   width=7,
                   height=3,
                   dpi=900)

        ##################################
        ### Figure to compare costs ####
        dfs = self.cost_dfs

        for i in range(len(dfs)):
            n_cols = dfs[i].shape[1]
            dfs[i]['mean'] = dfs[i].iloc[:, 0:n_cols].mean(axis=1)
            dfs[i]['std'] = dfs[i].iloc[:, 0:n_cols].std(axis=1)
            dfs[i]['low'] = dfs[i].iloc[:, 0:n_cols].quantile(0.025, axis=1)
            dfs[i]['high'] = dfs[i].iloc[:, 0:n_cols].quantile(0.975, axis=1)
            dfs[i]['program'] = self.directories[i]

        # Move reference program to the top of the list
        for i, df in enumerate(dfs):
            if df['program'].iloc[0] == self.ref_program:
                dfs.insert(0, dfs.pop(i))

        # Arrange dfs for plot 1
        dfs_p1 = dfs.copy()
        for i in range(len(dfs_p1)):
            # Reshape
            dfs_p1[i] = pd.melt(
                dfs_p1[i],
                id_vars=['datetime', 'mean', 'std', 'low', 'high', 'program'])

        # Combine dataframes into single dataframe for plotting
        df_p1 = dfs_p1[0]
        for i in dfs_p1[1:]:
            df_p1 = df_p1.append(i, ignore_index=True)

        # Output Emissions df for other uses (e.g. live plot)
        df_p1.to_csv(self.output_directory + 'rolling_cost_estimates.csv',
                     index=True)

        # Make plots from list of dataframes - one entry per dataframe
        theme_set(theme_linedraw())
        plot1 = (
            ggplot(None) + aes('datetime', 'value', group='program') +
            geom_ribbon(df_p1,
                        aes(ymin='low', ymax='high', fill='program'),
                        alpha=0.2) +
            geom_line(df_p1, aes('datetime', 'mean', colour='program'),
                      size=1) + ylab('Estimated cost per facility') +
            xlab('') + scale_colour_hue(h=0.15, l=0.25, s=0.9) +
            scale_x_datetime(labels=date_format('%Y')) +
            #scale_y_continuous(trans='log10') +
            labs(color='Program', fill='Program') +
            theme(panel_border=element_rect(colour="black", fill=None, size=2),
                  panel_grid_minor_x=element_blank(),
                  panel_grid_major_x=element_blank(),
                  panel_grid_minor_y=element_line(
                      colour='black', linewidth=0.5, alpha=0.3),
                  panel_grid_major_y=element_line(
                      colour='black', linewidth=1, alpha=0.5)))
        plot1.save(self.output_directory + 'cost_estimate_temporal.png',
                   width=7,
                   height=3,
                   dpi=900)

        ########################################
        ### Cost breakdown by program and method
        method_lists = []
        for i in range(len(self.directories)):
            df = pd.read_csv(self.output_directory + self.directories[i] +
                             "/timeseries_output_0.csv")
            df = df.filter(regex='cost$', axis=1)
            df = df.drop(columns=["total_daily_cost"])
            method_lists.append(list(df))

        costs = [[] for i in range(len(self.all_data))]
        for i in range(len(self.all_data)):
            for j in range(len(self.all_data[i])):
                simcosts = []
                for k in range(len(method_lists[i])):
                    timesteps = len(self.all_data[i][j][method_lists[i][k]])
                    simcosts.append(
                        (sum(self.all_data[i][j][method_lists[i][k]]) /
                         timesteps / self.n_sites) * 365)
                costs[i].append(simcosts)

        rows_list = []
        for i in range(len(costs)):
            df_temp = pd.DataFrame(costs[i])
            for j in range(len(df_temp.columns)):
                row = {
                    'Program': self.directories[i],
                    'Mean Cost': round(df_temp.iloc[:, j].mean()),
                    'St. Dev.': df_temp.iloc[:, j].std(),
                    'Method': method_lists[i][j].replace('_cost', ''),
                }
                rows_list.append(row)
        df = pd.DataFrame(rows_list)

        plot = (ggplot(
            df,
            aes(x='Program', y='Mean Cost', fill='Method', label='Mean Cost'))
                + geom_bar(stat="identity") + ylab('Cost per Site per Year') +
                xlab('Program') + scale_fill_hue(h=0.15, l=0.25, s=0.9) +
                geom_text(size=15, position=position_stack(vjust=0.5)) +
                theme(panel_border=element_rect(
                    colour="black", fill=None, size=2),
                      panel_grid_minor_x=element_blank(),
                      panel_grid_major_x=element_blank(),
                      panel_grid_minor_y=element_line(
                          colour='black', linewidth=0.5, alpha=0.3),
                      panel_grid_major_y=element_line(
                          colour='black', linewidth=1, alpha=0.5)))
        plot.save(self.output_directory + 'cost_comparison.png',
                  width=7,
                  height=3,
                  dpi=900)

        return
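The cost loop near the end converts each method's simulated cost column into an annual cost per site: sum the column, divide by the number of timesteps and by the number of sites, then scale to a year. The same arithmetic on made-up numbers:

import pandas as pd

daily_total_cost = pd.Series([120.0] * 730)   # toy cost column: two years of daily values
timesteps = len(daily_total_cost)
n_sites = 40
cost_per_site_per_year = (sum(daily_total_cost) / timesteps / n_sites) * 365
# -> (87600 / 730 / 40) * 365 = 1095.0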
result_df['거래금액'] = pd.to_numeric(result_df['거래금액'])
result_df['도로명시군구코드'] = pd.to_numeric(result_df['도로명시군구코드'], downcast='integer')
result_df = pd.merge(left=result_df, 
                     right=gu_code_data, 
                     left_on='도로명시군구코드', 
                     right_on='코드').drop('코드', axis=1)
result_df['년월'] = result_df['년'] + result_df['월']
result_df['년월'] = result_df['년월'].map(lambda x : datetime.datetime.strptime(x, '%Y%m'))

chart_df = result_df.groupby(['년월', '구'])['거래금액'].agg('sum')
chart_df = chart_df.reset_index()
chart_df['거래금액'] = chart_df['거래금액'] * 0.0001
chart_df = chart_df.query('년월 != "2019-07-01"')   # exclude June 2019: actual transaction prices are only partially reported so far

#%%
# Draw the chart

(ggplot(data=chart_df, mapping=aes(x = '년월', y = '거래금액', color='구'))
 + geom_line()
 + scale_x_datetime(breaks=date_breaks('6 months'), labels=date_format('%Y%m'))
 + scale_y_continuous(breaks=list(range(0, 20000, 2500)),labels = comma_format())
 + labs(x='기간', y='거래금액 (단위:억원)', color='')
 + theme(text=element_text(family='Malgun Gothic'))
)

#%%
# Pivot the data and save it to a file

trade_vol_df = chart_df.pivot(index='년월', columns='구', values='거래금액')
trade_vol_df.to_csv(file_path + 'apt_trade_vol_data.csv', encoding='euc-kr')
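Reading the pivoted file back later requires the same euc-kr encoding used to save it (file_path as defined earlier in the script):

trade_vol_df = pd.read_csv(file_path + 'apt_trade_vol_data.csv',
                           encoding='euc-kr', index_col='년월')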
# COMMAND ----------

from plotnine import *
from plotnine.data import meat
from mizani.breaks import date_breaks
from mizani.formatters import date_format

spkDF = spark.sql("SELECT DAYTIME,EVENT_ID, LUBE_OIL_PRESS FROM rc_data_05 WHERE RUL BETWEEN 1 AND 1440 and MM in (10,20,30,40,50) " )
spkDFPD = spkDF.toPandas()

#series = pdAll[['DAYTIME','LUBE_OIL_PRESS']]

pn = ggplot(spkDFPD, aes('DAYTIME','LUBE_OIL_PRESS')) + \
    geom_line(color='blue') + \
    scale_x_date(breaks=date_breaks('1 years'), labels=date_format('%b %Y')) + \
    scale_y_continuous() + theme_bw() + theme(figure_size=(12, 8))

# COMMAND ----------

display(pn.draw())

# COMMAND ----------

### This step cleans the data by replacing missing or unknown sensor values with the simple median of each column.
from pyspark.sql.functions import when

def replaceByMedian(pySparkDF, columnList):
    for colName in columnList:
        med = pySparkDF.approxQuantile(colName, [0.5], 0.25)
        pySparkDF = pySparkDF.withColumn(colName, when(pySparkDF[colName].isNotNull(),pySparkDF[colName]).otherwise(med[0]))