def test__can_insert_row():
    """A row inserted into a bitemporal ts comes back when selecting the
    latest data."""
    ts = get_bitemporal_test_data()
    ts = insert_at(ts, dt('2014-01-03'), [[9, 90]])
    assert len(ts) == 9

    latest = groupby_asof(ts)
    assert len(latest) == 4
    inserted = latest.loc[dt('2014-01-03')]
    assert inserted['OPEN'] == 9
    assert inserted['CLOSE'] == 90
def preprocess_years(self):
    """Load the 2009-2013 transaction CSVs, aggregate them per day, month
    and year, reindex the daily series onto business-day and calendar-day
    ranges, fill the gaps four different ways and write everything to CSV.
    """
    frames = {'day': [], 'month': [], 'year': []}
    for year in ['2009', '2010', '2011', '2012', '2013']:
        print('Preprocessing data in {}...'.format(year))
        loaded = self.load_csv(
            join(cfg.source_path,
                 'z_hack_transaction_{}_new.csv'.format(year)))
        for freq in ('day', 'month', 'year'):
            frames[freq].append(self.preprocess(loaded, freq))

    print('Concatenating data...')
    df_total_day: pd.DataFrame = pd.concat(frames['day'])
    df_total_month: pd.DataFrame = pd.concat(frames['month'])
    df_total_year: pd.DataFrame = pd.concat(frames['year'])

    print('Filling missing data...')
    start = dt(2009, 1, 4)
    end = dt(2013, 12, 31)
    fill_modes = ('ff', 'bf', 'avg', 'line')

    # Business days only.
    df_day_work = df_total_day.reindex(
        pd.bdate_range(start, end).rename('DATE'))
    work_filled = {mode: self.fill_nan(df_day_work, mode)
                   for mode in fill_modes}

    # Every calendar day.
    df_day_all = df_total_day.reindex(
        pd.date_range(start, end).rename('DATE'))
    all_filled = {mode: self.fill_nan(df_day_all, mode)
                  for mode in fill_modes}

    print('Writing data to CSV...')
    self.save_to_csv(df_total_day, 'day')
    self.save_to_csv(df_total_month, 'month')
    self.save_to_csv(df_total_year, 'year')
    self.save_to_csv(df_day_work, 'day_work')
    for mode in fill_modes:
        self.save_to_csv(work_filled[mode], 'day_w_{}'.format(mode))
    self.save_to_csv(df_day_all, 'day_all')
    for mode in fill_modes:
        self.save_to_csv(all_filled[mode], 'day_a_{}'.format(mode))
def test__get_ts__asof_datetime():
    """A timeseries selected as-of a point in time only contains rows
    observed by that time."""
    asof_df = groupby_asof(get_bitemporal_test_data(),
                           as_of=dt('2015-01-05'))
    assert len(asof_df) == 3
    assert all(asof_df['OPEN'] == [1.1, 2.1, 3.0])
    assert all(asof_df['CLOSE'] == [10.1, 20.1, 30.0])
def read_ca(file_directory, model):
    """Read central-agent definitions and their activity schedule from an
    Excel workbook.

    Parameters
    ----------
    file_directory : str
        Path to the workbook containing the 'central_agents' and
        'central_agents_schedule' sheets.
    model :
        Model instance handed to every ``CentralAgent`` constructor.

    Returns
    -------
    tuple
        ``(central_agents, central_agents_schedule_dict)`` where
        ``central_agents`` maps agent name -> ``CentralAgent`` and the
        schedule dict maps 'start'/'end' -> {date: [agent names]}.
    """
    central_agents_df = pd.read_excel(io=file_directory,
                                      sheet_name='central_agents',
                                      header=1)
    ca_scd_df = pd.read_excel(io=file_directory,
                              sheet_name='central_agents_schedule',
                              header=2)

    # Literal 'None' cells stand for missing values.  Pass `inplace` by
    # keyword: the positional form is deprecated in pandas.
    central_agents_df.replace('None', np.nan, inplace=True)

    # Schedule dates arrive as strings; convert to datetime.date.
    ca_scd_df['start'] = dt(ca_scd_df['start'], format='%d/%m/%Y').dt.date
    ca_scd_df['end'] = dt(ca_scd_df['end'], format='%d/%m/%Y').dt.date

    def _agents_by_date(column):
        # Map each unique date in `column` to the agents scheduled on it,
        # with the dates in ascending order.
        grouped = {
            date: list(ca_scd_df.loc[ca_scd_df[column] == date].central_agent)
            for date in ca_scd_df[column].unique()
        }
        return dict(sorted(grouped.items()))

    central_agents_schedule_dict = {'start': _agents_by_date('start'),
                                    'end': _agents_by_date('end')}

    # One CentralAgent per sheet row, keyed by name for direct lookup.
    central_agents = {}
    for index, row in central_agents_df.iterrows():
        ca = CentralAgent(index, model)
        ca.active = central_agents_df.active[index]
        ca.name = central_agents_df.name[index]
        ca.range = central_agents_df.range[index]
        ca.opinion = row.tolist()[3:]  # opinion vector starts at column 3
        central_agents[ca.name] = ca

    return central_agents, central_agents_schedule_dict
def download_and_read_data(country, deaths=False):
    """Download the JHU CSSE global time series (deaths or confirmed) and
    return the series for `country`, indexed by date."""
    # Download data
    metric = 'deaths' if deaths else 'confirmed'
    url = f"https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_{metric}_global.csv"
    df_file = "data.csv"
    urllib.request.urlretrieve(url, filename=df_file)

    # Read data; drop the geographic columns so only the per-date
    # columns remain, then transpose to get dates as the index.
    raw = pd.read_csv(df_file)
    meta_columns = ['Province/State', 'Country/Region', 'Lat', 'Long']
    country_df = raw[raw["Country/Region"] == country] \
        .drop(meta_columns, axis=1) \
        .T
    country_df.index = dt(country_df.index)
    return country_df
def main(country, start_date, look_ahead, post, deaths):
    """Fit an exponential-growth model to a country's COVID time series,
    build a prediction table and plot, and either post both to Slack or
    print/show them locally.

    Parameters: `country` selects the series, `start_date` the first fitted
    day, `look_ahead` the number of days to predict past today, `post`
    toggles Slack posting, `deaths` selects deaths vs confirmed cases.
    """
    # Parameters
    confidence = 95
    look_ahead = look_ahead * timedelta(days=1)  # int days -> timedelta

    df = download_and_read_data(country, deaths=deaths)

    message = ""
    message += f"Latest measurement from `{df.index[-1]}`\n"

    # Restrict the fit to dates on/after start_date.
    valid_indices = df.index >= dt(start_date)
    X = df[valid_indices].index
    X_int = date2int(X)
    # Fit in log space: exponential growth becomes linear.
    y_log = np.log(df[valid_indices])

    # Fit model
    linreg = sklearn.linear_model.LinearRegression()
    linreg.fit(X_int, y_log)
    r2 = linreg.score(X_int, y_log)
    message += f"Fitted linear regression with R^2={r2:.3f}\n"
    factor = coef2factor(linreg.coef_[0, 0])
    message += f"Increase factor per day: {factor:.3f}\n"
    # In how many days does the number double?
    message += f"Number doubles every {np.log(2) / np.log(factor):.1f} days, multiplies by 10 every {np.log(10) / np.log(factor):.1f}.\n"

    # Predict from start_date through today + look_ahead, back in
    # linear space via exp.
    X_pred = pd.date_range(start_date, end=datetime.today() + look_ahead,
                           freq='d')
    y_pred = np.exp(linreg.predict(date2int(X_pred)))

    # Compute confidence intervals
    lower, upper = compute_confidence_intervals(y_pred, X_pred, df,
                                                confidence=confidence)

    # Print: one table row per predicted day, wrapped in a Slack code block.
    fstr = "%a %d, %b"
    message += "```\n"
    message += " Predict. Diff. Measur. Diff.\n"
    message += "------------------------------------------------\n"
    y_prev = np.nan
    for day_curr, y, l, u in zip(X_pred, y_pred[:, 0], lower, upper):
        if day_curr.weekday() == 0:
            # Blank line before each Monday to group by week.
            message += "\n"
        try:
            measurement = df.loc[day_curr.date()].values[0]
        except KeyError:
            # No measurement for this (future or missing) day.
            measurement = np.nan
        # Difference to day before
        try:
            diff_measurement = measurement - df.loc[
                day_curr.date() - timedelta(days=1)].values[0]
        except KeyError:
            diff_measurement = np.nan
        diff_y = y - y_prev
        y_prev = y
        message += f"{day_curr.to_pydatetime().strftime(fstr)} {y:7.0f} ({diff_y:7.0f}+) {measurement:7.0f} ({diff_measurement:7.0f}+)"
        message += "\n"
    message += "```"

    # Plot: log-scale panel on top, linear below.
    plt.figure(figsize=(16, 16))
    for i, log in enumerate([True, False]):
        plt.subplot(2, 1, i + 1)
        plt.plot(df, label="Measurements", marker="|")
        plt.plot(X_pred, y_pred, label="Prediction", linestyle="--")
        plt.fill_between(X_pred, upper, y2=lower, alpha=0.2,
                         label=f"{confidence}% confidence interval")
        plt.axvline(x=dt(datetime.today().date()), color="grey",
                    linestyle="--", label="Today")
        plt.xlim([dt(start_date), X_pred[-1]])
        plt.ylim([y_pred[0], y_pred[-1]])
        plt.ylabel("Deaths" if deaths else "Cases")
        plt.gca().xaxis.set_minor_locator(mdates.DayLocator())
        plt.gca().xaxis.set_major_locator(
            mdates.WeekdayLocator(byweekday=mdates.MO))
        if log:
            plt.yscale("log")
        plt.legend(loc=2)
    plt.suptitle(f"Increase factor per day: {factor:.3f}")

    if post:
        # Post to Slack
        plt.savefig("corona_regression.png")
        # NOTE(review): SLACK_TOKEN appears to hold a *path* to a token
        # file, not the token itself — confirm against deployment config.
        with open(os.environ["SLACK_TOKEN"], "r") as f:
            slack_token = f.read().strip()
        client = slack.WebClient(token=slack_token)
        response = client.chat_postMessage(
            channel='#corona',
            text=message)
        response = client.files_upload(
            channels='#corona',
            file="corona_regression.png")
    else:
        print(message)
        plt.show()
def insert_at(df, sample_date, values):
    """Insert values into a bi-temporal dataframe, observed now.

    Mirrors what happens when a price correction arrives: the sample date
    stays fixed while the observed date is the current wall-clock time.
    """
    now = dt(datetime.now())
    return multi_index_insert_row(df, [sample_date, now], values)
def get_datetime_index_test_data():
    """Build a 12-row bi-temporal OPEN/CLOSE test frame.

    Three sample timestamps, each observed four times: two same-evening
    corrections, a next-midnight snapshot, and a revision a year later.
    Prices are 0, 10, 20, ... laid out row-major, e.g.::

        sample_dt            observed_dt          OPEN  CLOSE
        2014-01-01 21:30:00  2014-01-01 22:00:00     0     10
                             2014-01-01 22:30:00    20     30
                             ...                   ...    ...
        2014-03-01 21:30:00  2015-03-01 21:30:00   220    230
    """
    sample_strs = ['1/1/2014 21:30', '2/1/2014 21:30', '3/1/2014 21:30']
    sample_dates = pd.DatetimeIndex(
        [dt(s) for s in sample_strs for _ in range(4)])
    observed_dates = [dt(s) for s in (
        '1/1/2014 22:00', '1/1/2014 22:30', '2/1/2014 00:00', '1/1/2015 21:30',
        '2/1/2014 23:00', '2/1/2014 23:30', '3/1/2014 00:00', '2/1/2015 21:30',
        '3/1/2014 21:30', '3/1/2014 22:30', '4/1/2014 00:00', '3/1/2015 21:30',
    )]
    index = pd.MultiIndex.from_arrays([sample_dates, observed_dates],
                                      names=['sample_dt', 'observed_dt'])
    prices = np.arange(24).reshape(12, 2) * 10
    return pd.DataFrame(prices, index=index, columns=['OPEN', 'CLOSE'])
def get_datetime_index_test_data():
    """Return the canonical 12-row bi-temporal OPEN/CLOSE test frame.

    Rows are indexed by (sample_dt, observed_dt): three sample timestamps,
    each with four successively-observed revisions; prices count up by 10
    from 0 (OPEN) / 10 (CLOSE) to 220 / 230.
    """
    # NOTE(review): an identical function is defined earlier in this file
    # and is shadowed by this one — confirm the duplication is intentional.
    samples = pd.DatetimeIndex([dt('1/1/2014 21:30')] * 4
                               + [dt('2/1/2014 21:30')] * 4
                               + [dt('3/1/2014 21:30')] * 4)
    observed = [dt(s) for s in (
        '1/1/2014 22:00', '1/1/2014 22:30', '2/1/2014 00:00', '1/1/2015 21:30',
        '2/1/2014 23:00', '2/1/2014 23:30', '3/1/2014 00:00', '2/1/2015 21:30',
        '3/1/2014 21:30', '3/1/2014 22:30', '4/1/2014 00:00', '3/1/2015 21:30',
    )]
    idx = pd.MultiIndex.from_arrays([samples, observed],
                                    names=['sample_dt', 'observed_dt'])
    values = 10 * np.arange(24).reshape(12, 2)
    return pd.DataFrame(values, index=idx, columns=['OPEN', 'CLOSE'])
params[param] = False # read simulation settings settings_df = pd.read_excel(io=parameter_file, sheet_name='settings', header=0, index_col=0) settings = settings_df.to_dict('dict')['value'] # read calibration data calib = pd.read_excel(os.path.join(folder_input, 'adoption_rates.xlsx'), header=2) calib = calib.iloc[:, :] calib_date = calib.date.to_list() calib_date = [ dt(calib_date[i], format='%Y/%m/%d').date() for i in range(len(calib_date)) ] calib_groups = { 'gas': ['nat_gas_cb', 'nat_gas_cbs', 'nat_gas_lt'], 'oil': ['oil_cb', 'oil_cbs', 'oil_lt'], 'biomass': ['pellet_b', 'pellet_bs'], 'heat_pump': ['heat_pump_gw', 'heat_pump_air', 'heat_pump_vc', 'heat_pump_hc'], 'district_heating': ['district_heating'] } group_color = { 'gas': '0', 'oil': '1', 'biomass': '2', 'heat_pump': '3',
"""Plot a short series as a line chart (demo script)."""
import matplotlib.pyplot as plt
from pandas import DataFrame

# Keep the historical `dt` alias so any `import dt` from this module still
# works, but use the real name below: aliasing DataFrame as `dt` reads like
# a datetime helper and obscures what the call constructs.
dt = DataFrame

DataFrame([1, 2, 4, 6]).plot()
plt.show()
def read_param_ev(file):
    """Read time-evolution parameters and the subsidies schedule from an
    Excel workbook.

    Parameters
    ----------
    file : str
        Path to a workbook with the sheets 'energy_price_evolution',
        'emissions_evolution', 'subsidies_schedule' and
        'refurb_rate_evolution'.

    Returns
    -------
    list
        ``[energy_p_ev_dict, emi_ev_dict, env_change_type, sub_scd_dict,
        ref_rate_ev_dict]`` — date-keyed evolution dicts, the evolution
        type flags, and the start/end subsidies schedule.
    """
    energy_p_ev_df = pd.read_excel(io=file,
                                   sheet_name='energy_price_evolution',
                                   header=2, index_col=0)
    emi_ev_df = pd.read_excel(io=file, sheet_name='emissions_evolution',
                              header=2, index_col=0)
    # The single cell in column B of row 1 holds the evolution-type flag
    # for each sheet.
    emi_ev_type = pd.read_excel(io=file, sheet_name='emissions_evolution',
                                usecols='B', header=0, nrows=1).iloc[0, 0]
    energy_p_ev_type = pd.read_excel(io=file,
                                     sheet_name='energy_price_evolution',
                                     usecols='B', header=0,
                                     nrows=1).iloc[0, 0]
    sub_scd_df = pd.read_excel(io=file, sheet_name='subsidies_schedule',
                               header=2)
    ref_rate_ev_df = pd.read_excel(io=file,
                                   sheet_name='refurb_rate_evolution',
                                   header=2, index_col=0)

    # convert date input from string to datetime format
    sub_scd_df['start'] = dt(sub_scd_df['start'], format='%d/%m/%Y').dt.date
    sub_scd_df['end'] = dt(sub_scd_df['end'], format='%d/%m/%Y').dt.date
    emi_ev_df.index = dt(emi_ev_df.index, format='%d/%m/%Y').date
    energy_p_ev_df.index = dt(energy_p_ev_df.index, format='%d/%m/%Y').date
    # NOTE(review): two-digit year format '%y' here differs from the
    # four-digit '%Y' used for every other sheet — confirm the
    # refurb_rate_evolution sheet really uses dd/mm/yy dates.
    ref_rate_ev_df.index = dt(ref_rate_ev_df.index, format='%d/%m/%y').date

    # {date: {column: value}} mappings, re-keyed in ascending date order.
    energy_p_ev_dict = energy_p_ev_df.to_dict('index')
    energy_p_ev_dict = dict(sorted(energy_p_ev_dict.items()))
    emi_ev_dict = emi_ev_df.to_dict('index')
    emi_ev_dict = dict(sorted(emi_ev_dict.items()))
    ref_rate_ev_dict = ref_rate_ev_df.to_dict('dict')['rate']

    # convert subsidies schedule
    # unique start and end dates
    unique_start = sub_scd_df.start.unique()
    unique_end = sub_scd_df.end.unique()
    # Programs grouped by the date on which they start, sorted by date.
    dict_start = {
        date: list(sub_scd_df.loc[sub_scd_df['start'] == date].program)
        for date in unique_start
    }
    dict_start = dict(sorted(dict_start.items()))
    # Programs grouped by the date on which they end, sorted by date.
    dict_end = {
        date: list(sub_scd_df.loc[sub_scd_df['end'] == date].program)
        for date in unique_end
    }
    dict_end = dict(sorted(dict_end.items()))
    sub_scd_dict = {'start': dict_start, 'end': dict_end}

    env_change_type = {
        'energy_price': energy_p_ev_type,
        'emissions': emi_ev_type
    }

    return [
        energy_p_ev_dict, emi_ev_dict, env_change_type, sub_scd_dict,
        ref_rate_ev_dict
    ]