Beispiel #1
0
def test__can_insert_row():
    """Inserting a row into a bitemporal frame makes it visible in the
    latest as-of view.
    """
    frame = get_bitemporal_test_data()
    frame = insert_at(frame, dt('2014-01-03'), [[9, 90]])
    assert len(frame) == 9
    latest = groupby_asof(frame)
    assert len(latest) == 4
    inserted = latest.loc[dt('2014-01-03')]
    assert inserted['OPEN'] == 9
    assert inserted['CLOSE'] == 90
Beispiel #2
0
def test__can_insert_row():
    """A freshly inserted row comes back when the latest data is selected."""
    data = insert_at(get_bitemporal_test_data(), dt('2014-01-03'), [[9, 90]])
    assert len(data) == 9
    asof_view = groupby_asof(data)
    assert len(asof_view) == 4
    assert asof_view.loc[dt('2014-01-03')]['OPEN'] == 9
    assert asof_view.loc[dt('2014-01-03')]['CLOSE'] == 90
Beispiel #3
0
    def preprocess_years(self):
        """Load, aggregate, gap-fill and persist transaction data for 2009-2013.

        For each year a CSV is loaded and aggregated at day/month/year
        granularity; the per-year frames are concatenated, the daily frame is
        reindexed onto both a business-day and a full calendar-day range, and
        every variant (raw plus the four NaN-fill strategies) is written out
        via ``save_to_csv``.  Output file tags and write order match the
        original implementation exactly.
        """
        # One list of per-year frames per aggregation frequency.
        frames = {'day': [], 'month': [], 'year': []}
        for year in ['2009', '2010', '2011', '2012', '2013']:
            print('Preprocessing data in {}...'.format(year))
            df_loaded = self.load_csv(
                join(cfg.source_path,
                     'z_hack_transaction_{}_new.csv'.format(year)))
            for freq in ('day', 'month', 'year'):
                frames[freq].append(self.preprocess(df_loaded, freq))

        print('Concatenating data...')
        totals = {freq: pd.concat(parts) for freq, parts in frames.items()}

        print('Filling missing data...')
        start = dt(2009, 1, 4)
        end = dt(2013, 12, 31)

        # Business days only, then every calendar day; fill gaps with each
        # of the four strategies supported by fill_nan.
        fill_methods = ('ff', 'bf', 'avg', 'line')
        df_day_work = totals['day'].reindex(
            pd.bdate_range(start, end).rename('DATE'))
        filled_work = {m: self.fill_nan(df_day_work, m) for m in fill_methods}

        df_day_all = totals['day'].reindex(
            pd.date_range(start, end).rename('DATE'))
        filled_all = {m: self.fill_nan(df_day_all, m) for m in fill_methods}

        print('Writing data to CSV...')
        for freq in ('day', 'month', 'year'):
            self.save_to_csv(totals[freq], freq)
        self.save_to_csv(df_day_work, 'day_work')
        for method in fill_methods:
            self.save_to_csv(filled_work[method], 'day_w_{}'.format(method))
        self.save_to_csv(df_day_all, 'day_all')
        for method in fill_methods:
            self.save_to_csv(filled_all[method], 'day_a_{}'.format(method))
Beispiel #4
0
def test__get_ts__asof_datetime():
    """A timeseries can be retrieved as-of a particular point in time."""
    result = groupby_asof(get_bitemporal_test_data(), as_of=dt('2015-01-05'))
    assert len(result) == 3
    expected_open = [1.1, 2.1, 3.0]
    expected_close = [10.1, 20.1, 30.0]
    assert all(result['OPEN'] == expected_open)
    assert all(result['CLOSE'] == expected_close)
Beispiel #5
0
def test__get_ts__asof_datetime():
    """Requesting the series as-of a timestamp returns only rows known then."""
    as_of = dt('2015-01-05')
    ts = groupby_asof(get_bitemporal_test_data(), as_of=as_of)
    assert len(ts) == 3
    for column, expected in (('OPEN', [1.1, 2.1, 3.0]),
                             ('CLOSE', [10.1, 20.1, 30.0])):
        assert all(ts[column] == expected)
Beispiel #6
0
def read_ca(file_directory, model):
    """Read central-agent definitions and their schedule from an Excel workbook.

    Parameters
    ----------
    file_directory : path to the workbook holding the 'central_agents' and
        'central_agents_schedule' sheets.
    model : forwarded unchanged to each ``CentralAgent`` constructor.

    Returns
    -------
    tuple
        ``(central_agents, central_agents_schedule_dict)`` where the first is
        a mapping of agent name -> CentralAgent and the second has 'start' and
        'end' keys, each mapping a date to the list of agent names whose
        schedule starts/ends on that date (sorted by date).
    """
    central_agents_df = pd.read_excel(io=file_directory,
                                      sheet_name='central_agents',
                                      header=1)
    ca_scd_df = pd.read_excel(io=file_directory,
                              sheet_name='central_agents_schedule',
                              header=2)

    # Normalise the textual 'None' placeholder to a real NaN.
    # NOTE: `inplace` is keyword-only in pandas >= 2.0 — the previous
    # positional `True` would raise a TypeError there.
    central_agents_df.replace('None', np.nan, inplace=True)

    # Schedule columns arrive as strings; convert them to plain dates.
    # `dt` is presumably pandas.to_datetime here — TODO confirm at import site.
    ca_scd_df['start'] = dt(ca_scd_df['start'], format='%d/%m/%Y').dt.date
    ca_scd_df['end'] = dt(ca_scd_df['end'], format='%d/%m/%Y').dt.date

    def _agents_by(column):
        # Group agent names by each unique date in `column`, sorted by date
        # so downstream iteration is deterministic.
        grouped = {
            date: list(ca_scd_df.loc[ca_scd_df[column] == date].central_agent)
            for date in ca_scd_df[column].unique()
        }
        return dict(sorted(grouped.items()))

    central_agents_schedule_dict = {'start': _agents_by('start'),
                                    'end': _agents_by('end')}

    central_agents = {}
    for index, row in central_agents_df.iterrows():
        ca = CentralAgent(index, model)
        ca.active = central_agents_df.active[index]
        ca.name = central_agents_df.name[index]
        ca.range = central_agents_df.range[index]
        # Everything past the first three fixed columns is the opinion vector.
        ca.opinion = row.tolist()[3:]
        central_agents[ca.name] = ca

    return central_agents, central_agents_schedule_dict
Beispiel #7
0
def download_and_read_data(country, deaths=False):
    """Download the JHU CSSE COVID-19 global time series and return the
    transposed per-date series for *country* (deaths or confirmed cases).
    """
    # Download data
    url = f"https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_{'deaths' if deaths else 'confirmed'}_global.csv"
    local_file = "data.csv"
    urllib.request.urlretrieve(url, filename=local_file)

    # Read data: keep only the country's rows, drop the non-date metadata
    # columns, and transpose so that dates form the index.
    raw = pd.read_csv(local_file)
    meta_columns = ['Province/State', 'Country/Region', 'Lat', 'Long']
    country_rows = raw[raw["Country/Region"] == country]
    series = country_rows.drop(meta_columns, axis=1).T

    series.index = dt(series.index)

    return series
Beispiel #8
0
def main(country, start_date, look_ahead, post, deaths):
    """Fit a log-linear (exponential-growth) model to a country's COVID-19
    counts, build a textual forecast table, plot measurements vs prediction,
    and either post everything to Slack or print/show it locally.

    Parameters: country (name as in the JHU data), start_date (fit window
    start), look_ahead (days to extrapolate, int), post (bool: Slack vs
    local output), deaths (bool: deaths instead of confirmed cases).
    """
    # Parameters
    confidence = 95
    look_ahead = look_ahead * timedelta(days=1)  # int days -> timedelta

    df = download_and_read_data(country, deaths=deaths)
    message = ""
    message += f"Latest measurement from `{df.index[-1]}`\n"

    # Restrict to the fit window; `dt` presumably converts start_date to a
    # timestamp comparable with the index — TODO confirm at import site.
    valid_indices = df.index >= dt(start_date)
    X = df[valid_indices].index
    X_int = date2int(X)
    y_log = np.log(df[valid_indices])

    # Fit model: linear regression on log-counts == exponential growth model.
    linreg = sklearn.linear_model.LinearRegression()
    linreg.fit(X_int, y_log)
    r2 = linreg.score(X_int, y_log)
    message += f"Fitted linear regression with R^2={r2:.3f}\n"

    # Daily multiplicative growth factor derived from the slope.
    factor = coef2factor(linreg.coef_[0, 0])
    message += f"Increase factor per day: {factor:.3f}\n"

    # In how many days does the number double?
    message += f"Number doubles every {np.log(2) / np.log(factor):.1f} days, multiplies by 10 every {np.log(10) / np.log(factor):.1f}.\n"

    # Predict daily values from start_date through today + look_ahead.
    X_pred = pd.date_range(start_date, end=datetime.today() + look_ahead,
                           freq='d')
    y_pred = np.exp(linreg.predict(date2int(X_pred)))

    # Compute confidence intervals
    lower, upper = compute_confidence_intervals(y_pred, X_pred, df,
                                                confidence=confidence)

    # Print: fixed-width table of prediction vs measurement, with
    # day-over-day differences; a blank line starts each Monday.
    fstr = "%a %d, %b"
    message += "```\n"
    message += "           Predict.      Diff.   Measur.    Diff.\n"
    message += "------------------------------------------------\n"
    y_prev = np.nan
    for day_curr, y, l, u in zip(X_pred, y_pred[:, 0], lower, upper):
        if day_curr.weekday() == 0:
            message += "\n"
        # Measurement may not exist for future (or missing) dates.
        try:
            measurement = df.loc[day_curr.date()].values[0]
        except KeyError:
            measurement = np.nan

        # Difference to day before
        try:
            diff_measurement = measurement - df.loc[
                day_curr.date() - timedelta(days=1)].values[0]
        except KeyError:
            diff_measurement = np.nan

        diff_y = y - y_prev
        y_prev = y

        message += f"{day_curr.to_pydatetime().strftime(fstr)} {y:7.0f} ({diff_y:7.0f}+) {measurement:7.0f} ({diff_measurement:7.0f}+)"

        message += "\n"
    message += "```"

    # Plot: two stacked panels, log scale on top, linear below.
    plt.figure(figsize=(16, 16))
    for i, log in enumerate([True, False]):
        plt.subplot(2, 1, i + 1)
        plt.plot(df, label="Measurements", marker="|")
        plt.plot(X_pred, y_pred, label="Prediction", linestyle="--")
        plt.fill_between(X_pred, upper, y2=lower, alpha=0.2,
                         label=f"{confidence}% confidence interval")
        plt.axvline(x=dt(datetime.today().date()), color="grey", linestyle="--",
                    label="Today")
        plt.xlim([dt(start_date), X_pred[-1]])
        plt.ylim([y_pred[0], y_pred[-1]])
        plt.ylabel("Deaths" if deaths else "Cases")
        plt.gca().xaxis.set_minor_locator(mdates.DayLocator())
        plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator(byweekday=mdates.MO))
        if log:
            plt.yscale("log")
        plt.legend(loc=2)
        plt.suptitle(f"Increase factor per day: {factor:.3f}")

    if post:
        # Post to Slack: SLACK_TOKEN env var holds a *path* to the token file.
        plt.savefig("corona_regression.png")
        with open(os.environ["SLACK_TOKEN"], "r") as f:
            slack_token = f.read().strip()
        client = slack.WebClient(token=slack_token)
        response = client.chat_postMessage(
            channel='#corona',
            text=message)
        response = client.files_upload(
            channels='#corona',
            file="corona_regression.png")
    else:
        print(message)
        plt.show()
Beispiel #9
0
def insert_at(df, sample_date, values):
    """Insert *values* into a bi-temporal dataframe at *sample_date*,
    observed at the current wall-clock time — the shape of a price
    correction arriving.
    """
    now = dt(datetime.now())
    return multi_index_insert_row(df, [sample_date, now], values)
Beispiel #10
0
def get_datetime_index_test_data():
    """Build the 12-row bi-temporal OPEN/CLOSE test frame
    (MultiIndex of sample_dt x observed_dt)."""
    sample_dts = pd.DatetimeIndex([dt('1/1/2014 21:30')] * 4 +
                                  [dt('2/1/2014 21:30')] * 4 +
                                  [dt('3/1/2014 21:30')] * 4)
    observed_dts = [dt(stamp) for stamp in (
        '1/1/2014 22:00', '1/1/2014 22:30', '2/1/2014 00:00', '1/1/2015 21:30',
        '2/1/2014 23:00', '2/1/2014 23:30', '3/1/2014 00:00', '2/1/2015 21:30',
        '3/1/2014 21:30', '3/1/2014 22:30', '4/1/2014 00:00', '3/1/2015 21:30',
    )]
    idx = pd.MultiIndex.from_arrays([sample_dts, observed_dts],
                                    names=['sample_dt', 'observed_dt'])

    # 12 rows x 2 columns of multiples of 10: [[0, 10], [20, 30], ...]
    prices = np.arange(0, 240, 10).reshape(12, 2)

    #                                          OPEN  CLOSE
    # sample_dt           observed_dt
    # 2014-01-01 21:30:00 2014-01-01 22:00:00     0     10
    #                     2014-01-01 22:30:00    20     30
    #                     2014-02-01 00:00:00    40     50
    #                     2015-01-01 21:30:00    60     70
    # 2014-02-01 21:30:00 2014-02-01 23:00:00    80     90
    #                     2014-02-01 23:30:00   100    110
    #                     2014-03-01 00:00:00   120    130
    #                     2015-02-01 21:30:00   140    150
    # 2014-03-01 21:30:00 2014-03-01 21:30:00   160    170
    #                     2014-03-01 22:30:00   180    190
    #                     2014-04-01 00:00:00   200    210
    #                     2015-03-01 21:30:00   220    230
    return pd.DataFrame(prices, index=idx, columns=['OPEN', 'CLOSE'])
Beispiel #11
0
def insert_at(df, sample_date, values):
    """Append *values* to the bi-temporal frame under the key
    (sample_date, now) — what a price correction looks like."""
    return multi_index_insert_row(df, [sample_date, dt(datetime.now())], values)
Beispiel #12
0
def get_datetime_index_test_data():
    """Return a bi-temporal OPEN/CLOSE frame with three sample dates,
    each carrying four observation timestamps."""
    samples = []
    for stamp in ('1/1/2014 21:30', '2/1/2014 21:30', '3/1/2014 21:30'):
        samples.extend([dt(stamp)] * 4)
    sample_index = pd.DatetimeIndex(samples)

    observed = [dt(stamp) for stamp in (
        '1/1/2014 22:00', '1/1/2014 22:30', '2/1/2014 00:00', '1/1/2015 21:30',
        '2/1/2014 23:00', '2/1/2014 23:30', '3/1/2014 00:00', '2/1/2015 21:30',
        '3/1/2014 21:30', '3/1/2014 22:30', '4/1/2014 00:00', '3/1/2015 21:30',
    )]
    multi = pd.MultiIndex.from_arrays([sample_index, observed],
                                      names=['sample_dt', 'observed_dt'])

    values = 10 * np.arange(24).reshape(12, 2)
    frame = pd.DataFrame(values, index=multi, columns=['OPEN', 'CLOSE'])

    #                                          OPEN  CLOSE
    # sample_dt           observed_dt
    # 2014-01-01 21:30:00 2014-01-01 22:00:00     0     10
    #                     2014-01-01 22:30:00    20     30
    #                     2014-02-01 00:00:00    40     50
    #                     2015-01-01 21:30:00    60     70
    # 2014-02-01 21:30:00 2014-02-01 23:00:00    80     90
    #                     2014-02-01 23:30:00   100    110
    #                     2014-03-01 00:00:00   120    130
    #                     2015-02-01 21:30:00   140    150
    # 2014-03-01 21:30:00 2014-03-01 21:30:00   160    170
    #                     2014-03-01 22:30:00   180    190
    #                     2014-04-01 00:00:00   200    210
    #                     2015-03-01 21:30:00   220    230
    return frame
Beispiel #13
0
        params[param] = False

# read simulation settings
# `parameter_file` is defined earlier in this script (outside this view).
settings_df = pd.read_excel(io=parameter_file,
                            sheet_name='settings',
                            header=0,
                            index_col=0)
# Flatten the 'value' column into a plain {setting_name: value} dict.
settings = settings_df.to_dict('dict')['value']

# read calibration data
calib = pd.read_excel(os.path.join(folder_input, 'adoption_rates.xlsx'),
                      header=2)
# NOTE(review): `.iloc[:, :]` selects everything — effectively a copy; it
# looks like a leftover from a narrower slice. Left as-is.
calib = calib.iloc[:, :]
calib_date = calib.date.to_list()
# Parse the date strings into datetime.date objects.
# `dt` is presumably pandas.to_datetime here — TODO confirm at import site.
calib_date = [
    dt(calib_date[i], format='%Y/%m/%d').date() for i in range(len(calib_date))
]
# Heating technologies grouped into the categories used for calibration.
calib_groups = {
    'gas': ['nat_gas_cb', 'nat_gas_cbs', 'nat_gas_lt'],
    'oil': ['oil_cb', 'oil_cbs', 'oil_lt'],
    'biomass': ['pellet_b', 'pellet_bs'],
    'heat_pump':
    ['heat_pump_gw', 'heat_pump_air', 'heat_pump_vc', 'heat_pump_hc'],
    'district_heating': ['district_heating']
}

group_color = {
    'gas': '0',
    'oil': '1',
    'biomass': '2',
    'heat_pump': '3',
Beispiel #14
0
import matplotlib.pyplot as plt
from pandas import DataFrame as dt

# Tiny demo: wrap a list in a DataFrame and show its default line plot.
frame = dt([1, 2, 4, 6])
frame.plot()
plt.show()
Beispiel #15
0
def read_param_ev(file):
    """Read environment-evolution parameters from an Excel workbook.

    Loads energy-price and emissions evolution tables (plus their evolution
    type flags), the subsidies schedule and the refurbishment-rate evolution,
    converts all date columns/indices to ``datetime.date``, and returns a
    five-element list: energy-price dict, emissions dict, evolution-type
    dict, subsidies-schedule dict ('start'/'end' maps), refurb-rate dict.
    """
    energy_prices = pd.read_excel(io=file,
                                  sheet_name='energy_price_evolution',
                                  header=2,
                                  index_col=0)
    emissions = pd.read_excel(io=file,
                              sheet_name='emissions_evolution',
                              header=2,
                              index_col=0)
    # The single cell in column B / row 1 of each sheet holds the
    # evolution-type flag.
    emissions_type = pd.read_excel(io=file,
                                   sheet_name='emissions_evolution',
                                   usecols='B',
                                   header=0,
                                   nrows=1).iloc[0, 0]
    energy_price_type = pd.read_excel(io=file,
                                      sheet_name='energy_price_evolution',
                                      usecols='B',
                                      header=0,
                                      nrows=1).iloc[0, 0]
    subsidies = pd.read_excel(io=file,
                              sheet_name='subsidies_schedule',
                              header=2)
    refurb_rates = pd.read_excel(io=file,
                                 sheet_name='refurb_rate_evolution',
                                 header=2,
                                 index_col=0)

    # convert date input from string to datetime format
    subsidies['start'] = dt(subsidies['start'], format='%d/%m/%Y').dt.date
    subsidies['end'] = dt(subsidies['end'], format='%d/%m/%Y').dt.date
    emissions.index = dt(emissions.index, format='%d/%m/%Y').date
    energy_prices.index = dt(energy_prices.index, format='%d/%m/%Y').date
    refurb_rates.index = dt(refurb_rates.index, format='%d/%m/%y').date

    energy_p_ev_dict = dict(sorted(energy_prices.to_dict('index').items()))
    emi_ev_dict = dict(sorted(emissions.to_dict('index').items()))
    ref_rate_ev_dict = refurb_rates.to_dict('dict')['rate']

    # Subsidy programs grouped by their unique start and end dates,
    # each map sorted by date.
    sub_scd_dict = {}
    for boundary in ('start', 'end'):
        by_date = {
            date: list(subsidies.loc[subsidies[boundary] == date].program)
            for date in subsidies[boundary].unique()
        }
        sub_scd_dict[boundary] = dict(sorted(by_date.items()))

    env_change_type = {
        'energy_price': energy_price_type,
        'emissions': emissions_type
    }

    return [
        energy_p_ev_dict, emi_ev_dict, env_change_type, sub_scd_dict,
        ref_rate_ev_dict
    ]