def store_input(
    self,
    filepath_or_buffer,
    df_input=None,
    src_spec=None,
    dest_spec=None,
    file_extension=None,
):
    """Capture input data for unit tests.

    Converts the (default: full) input DataFrame from ``src_spec`` to
    ``dest_spec`` and writes it via the destination's extension-aware writer.

    :param filepath_or_buffer: target path or file-like object to write to.
    :param df_input: DataFrame to store; defaults to ``self.get_full_input()``.
    :param src_spec: spec describing ``df_input``; defaults to ``self.internal_spec``.
    :param dest_spec: spec to convert to; defaults to the destination's spec.
    :param file_extension: output format; defaults to the destination's extension.
    """
    # BUG FIX: `if not df_input:` raises ValueError for a DataFrame
    # ("truth value of a DataFrame is ambiguous") and would also silently
    # replace a deliberately-empty DataFrame. Compare against None instead.
    if df_input is None:
        df_input = self.get_full_input()
    # `is None` for the remaining defaults too, so falsy-but-valid values
    # (e.g. an empty-string extension) are not clobbered.
    if src_spec is None:
        src_spec = self.internal_spec
    if dest_spec is None:
        dest_spec = self.destination.data_spec
    if file_extension is None:
        file_extension = self.destination.file_extension
    _df = convert_spec(df=df_input, src_spec=src_spec, dest_spec=dest_spec, copy=True)
    self.destination.write_data_by_extension(
        _df,
        filepath_or_buffer,
        data_spec=dest_spec,
        file_extension=file_extension,
    )
def test_put_data(self):
    """Round-trip check via GCS: write the full input, read the parquet
    back, convert to the internal spec, and compare against the original
    (minus columns the destination spec does not represent)."""
    sim_name = self.get_sim_name()
    dest = self.data_client.destination
    original = self.data_client.get_full_input()
    dest.put_data(original, sim_name, src_spec=Internal())
    read_back = pd.read_parquet(dest.get_gcs_uri(sim_name))
    converted = convert_spec(
        read_back,
        src_spec=dest.data_spec,
        dest_spec=Internal(),
        src_nullable=True,
        dest_nullable=False,
    )
    # columns without a matching internal_state in the destination spec
    # are lost in the round trip; drop them before comparing
    spec_states = {
        v["internal_state"] for v in dest.data_spec.full.spec.values()
    }
    extra_columns = [c for c in original.columns if c not in spec_states]
    expected = original.drop(columns=extra_columns)
    pd.testing.assert_frame_equal(expected, converted)
def test_put_data(self):
    """Round-trip check via the local cache: write the full input, read
    the cached parquet back, convert to the internal spec, and compare
    against the original (minus columns the destination spec does not
    represent)."""
    sim_name = self.get_sim_name()
    dest = self.data_client.destination
    original = self.data_client.get_full_input()
    dest.put_data(original, sim_name, src_spec=Internal())
    cache_path = os.path.join(
        dest.local_cache,
        dest.operator_name,
        sim_name + "." + dest.file_extension,
    )
    read_back = pd.read_parquet(cache_path)
    converted = convert_spec(
        read_back,
        src_spec=dest.data_spec,
        dest_spec=Internal(),
        src_nullable=True,
        dest_nullable=False,
    )
    # columns without a matching internal_state in the destination spec
    # are lost in the round trip; drop them before comparing
    spec_states = {
        v["internal_state"] for v in dest.data_spec.full.spec.values()
    }
    extra_columns = [c for c in original.columns if c not in spec_states]
    expected = original.drop(columns=extra_columns)
    pd.testing.assert_frame_equal(expected, converted)
def put_data(self, df, sim_name, src_spec):
    """Convert ``df`` from ``src_spec`` to this destination's spec (on a
    copy, leaving the caller's frame untouched) and write it to the local
    cache file for ``sim_name``."""
    converted = convert_spec(
        df=df, src_spec=src_spec, dest_spec=self.data_spec, copy=True
    )
    self.put_local_cache(converted, self.get_local_cache_file(sim_name))
def get_data(self, sim_config):
    """Read data for ``sim_config`` from the local cache, drop unused
    columns, and convert it (in place, ``copy=False``) to the internal
    spec."""
    cache_file = self.get_local_cache_file(
        identifier=sim_config["identifier"]
    )
    data = self.get_local_cache(cache_file)
    data = self.drop_unused_columns(_data=data)
    return convert_spec(
        df=data, src_spec=self.data_spec, dest_spec=Internal(), copy=False
    )
def get_data(self, sim_config):
    """Return data for ``sim_config``, preferring the local cache and
    falling back to the GCS cache when the local copy is empty; the
    result is converted to the internal spec."""
    cache_file = self.get_local_cache_file(
        identifier=sim_config["identifier"]
    )
    data = self.get_local_cache(cache_file)
    if data.empty:
        data = self.get_gcs_cache(sim_config, cache_file)
    data = self.drop_unused_columns(_data=data)
    return convert_spec(df=data, src_spec=self.data_spec, dest_spec=Internal())
def test_put_data(self):
    """Round-trip check via GCS: write the full input, read the parquet
    back, convert to the internal spec, and assert equality with the
    original frame."""
    sim_name = self.get_sim_name()
    dest = self.data_client.destination
    original = self.data_client.get_full_input()
    dest.put_data(original, sim_name, src_spec=Internal())
    read_back = pd.read_parquet(dest.get_gcs_uri(sim_name))
    recovered = convert_spec(
        read_back,
        src_spec=dest.data_spec,
        dest_spec=Internal(),
    )
    assert original.equals(recovered)
def test_put_data(self):
    """Round-trip check via the local cache: write the full input, read
    the cached parquet back, convert to the internal spec, and assert
    equality with the original frame."""
    sim_name = self.get_sim_name()
    dest = self.data_client.destination
    original = self.data_client.get_full_input()
    dest.put_data(original, sim_name, src_spec=Internal())
    cache_path = os.path.join(
        dest.local_cache,
        dest.operator_name,
        sim_name + "." + dest.file_extension,
    )
    read_back = pd.read_parquet(cache_path)
    recovered = convert_spec(
        read_back,
        src_spec=dest.data_spec,
        dest_spec=Internal(),
    )
    assert original.equals(recovered)
def get_data(self):
    """Load, clean, fill, and split the simulation input data, then build
    the per-domain data channels (datetime, thermostat, equipment,
    sensors, weather) used during simulation.

    Reads ``self.sim_config`` for the simulation window and identifier.
    Sets ``self.full_data_periods`` and the channel attributes as side
    effects.

    :raises ValueError: if ``end_utc <= start_utc``, or if no full data
        periods exist for the requested duration.
    """
    # check for invalid start/end combination
    if self.sim_config["end_utc"] <= self.sim_config["start_utc"]:
        raise ValueError(
            "sim_config contains invalid start_utc >= end_utc.")
    # load from cache or download data from source
    _data = self.source.get_data(self.sim_config)
    if _data.empty:
        logger.error(
            "EMPTY DATA SOURCE: \nsim_config={} \nsource={}\n".format(
                self.sim_config, self.source))
        _data = self.internal_spec.get_empty_df()

    # remove any fully duplicated records
    _data = _data.drop_duplicates(ignore_index=True)

    # remove multiple records for same datetime
    # there may also be multiple entries for same exact datetime in ISM
    # in this case keep the record that has the most combined runtime
    # because in observed cases of this the extra record has 0 runtime.
    _runtime_sum_column = "sum_runtime"
    # FIX: indexing a DataFrame with a raw set is unsupported in modern
    # pandas; pass a list of the intersecting equipment columns instead
    # (order is irrelevant because the values are summed).
    _equipment_columns = list(
        set(self.internal_spec.equipment.spec.keys()) & set(_data.columns)
    )
    _data[_runtime_sum_column] = _data[_equipment_columns].sum(axis=1)
    # last duplicate datetime value will have maximum sum_runtime
    _data = _data.sort_values(
        [self.internal_spec.datetime_column, _runtime_sum_column],
        ascending=True,
    )
    _data = _data.drop_duplicates(
        subset=[STATES.DATE_TIME], keep="last", ignore_index=True)
    _data = _data.drop(columns=[_runtime_sum_column])

    # truncate the data to desired simulation start and end time
    _data = _data[
        (_data[self.internal_spec.datetime_column]
         >= self.sim_config["start_utc"])
        & (_data[self.internal_spec.datetime_column]
           <= self.sim_config["end_utc"])].reset_index(drop=True)

    # remove unused categories from categorical columns after date range
    # for simulation is selected
    for _cat_col in [
            _col for _col in _data.columns
            if isinstance(_data[_col].dtype, pd.api.types.CategoricalDtype)
    ]:
        # BUG FIX: the original assigned to the `.cat` accessor
        # (`_data[col].cat = ...`), which does not write the result back
        # to the DataFrame. Assign the returned Series to the column.
        _data[_cat_col] = _data[_cat_col].cat.remove_unused_categories()

    # run settings change point detection before filling missing data
    # the fill data would create false positive change points
    # the change points can also be used to correctly fill the schedule
    # and comfort preferences
    (
        _change_points_schedule,
        _change_points_comfort_prefs,
        _change_points_hvac_mode,
    ) = ThermostatChannel.get_settings_change_points(
        _data, self.internal_spec.data_period_seconds)

    _expected_period = f"{self.internal_spec.data_period_seconds}S"
    # ffill first 15 minutes of missing data periods
    _data = DataClient.fill_missing_data(
        full_data=_data,
        expected_period=_expected_period,
        data_spec=self.internal_spec,
    )
    # compute full_data_periods with only first 15 minutes ffilled
    self.full_data_periods = DataClient.get_full_data_periods(
        full_data=_data,
        data_spec=self.internal_spec,
        expected_period=_expected_period,
        min_sim_period=self.sim_config["min_sim_period"],
    )

    # need time zone before init of DatetimeChannel
    internal_timezone = DateTimeChannel.get_timezone(
        self.sim_config["latitude"], self.sim_config["longitude"])

    # there will be filled data even if there are no full_data_periods
    # the fill data is present to run continuous simulations smoothly
    # in the presence of potentially many missing data periods
    if self.full_data_periods:
        # the simulation period must be full days starting at 0 hour to use
        # SimulationControl: Run Simulation for Weather File Run Periods
        _start_utc, _end_utc = self.get_simulation_period(
            expected_period=_expected_period,
            internal_timezone=internal_timezone,
        )

        # add records for warmup period
        _data = DataClient.add_fill_records(
            df=_data,
            data_spec=self.internal_spec,
            start_utc=_start_utc,
            end_utc=_end_utc,
            expected_period=_expected_period,
        )

        # drop records before and after full simulation time
        _data = _data[
            (_data[self.internal_spec.datetime_column] >= _start_utc)
            & (_data[self.internal_spec.datetime_column]
               <= _end_utc)].reset_index(drop=True)

        # bfill to interpolate missing data
        # first and last records must be full because we used full data periods
        # need to add a NA_code to stop fillna from clobbering columns
        # where NA means something
        na_code_name = "NA_code"
        # FIX: `add_categories(..., inplace=True)` is deprecated and
        # removed in pandas >= 2.0; assign the returned Series instead.
        _data[STATES.CALENDAR_EVENT] = _data[
            STATES.CALENDAR_EVENT].cat.add_categories(
                new_categories=na_code_name)
        _data[STATES.CALENDAR_EVENT] = _data[STATES.CALENDAR_EVENT].fillna(
            na_code_name)

        # bfill then ffill to handle where no data after null
        # FIX: `fillna(method=...)` is deprecated; use bfill()/ffill().
        _data = _data.bfill(limit=None)
        _data = _data.ffill(limit=None)

        _data = DataClient.resample_to_step_size(
            df=_data,
            step_size_seconds=self.sim_config["sim_step_size_seconds"],
            data_spec=self.internal_spec,
        )

        # we can replace na_code_name now that filling is complete
        _data.loc[_data[STATES.CALENDAR_EVENT] == na_code_name,
                  [STATES.CALENDAR_EVENT],
                  ] = pd.NA

        # finally convert dtypes to final types now that nulls in
        # non-nullable columns have been properly filled or removed
        _data = convert_spec(_data,
                             src_spec=self.internal_spec,
                             dest_spec=self.internal_spec,
                             src_nullable=True,
                             dest_nullable=False)
    else:
        raise ValueError(
            f"ID={self.sim_config['identifier']} has no full_data_periods "
            + "for requested duration: "
            + f"start_utc={self.sim_config['start_utc']}, "
            + f"end_utc={self.sim_config['end_utc']} "
            + f"with min_sim_period={self.sim_config['min_sim_period']}")

    self.datetime = DateTimeChannel(
        data=_data[self.internal_spec.intersect_columns(
            _data.columns, self.internal_spec.datetime.spec)],
        spec=self.internal_spec.datetime,
        latitude=self.sim_config["latitude"],
        longitude=self.sim_config["longitude"],
        internal_timezone=internal_timezone,
    )

    # finally create the data channel objs for usage during simulation
    self.thermostat = ThermostatChannel(
        data=_data[self.internal_spec.intersect_columns(
            _data.columns, self.internal_spec.thermostat.spec)],
        spec=self.internal_spec.thermostat,
        change_points_schedule=_change_points_schedule,
        change_points_comfort_prefs=_change_points_comfort_prefs,
        change_points_hvac_mode=_change_points_hvac_mode,
    )
    self.equipment = EquipmentChannel(
        data=_data[self.internal_spec.intersect_columns(
            _data.columns, self.internal_spec.equipment.spec)],
        spec=self.internal_spec.equipment,
    )
    self.sensors = SensorsChannel(
        data=_data[self.internal_spec.intersect_columns(
            _data.columns, self.internal_spec.sensors.spec)],
        spec=self.internal_spec.sensors,
    )
    self.sensors.drop_unused_room_sensors()
    self.weather = WeatherChannel(
        data=_data[self.internal_spec.intersect_columns(
            _data.columns, self.internal_spec.weather.spec)],
        spec=self.internal_spec.weather,
        archive_tmy3_dir=self.archive_tmy3_dir,
        archive_tmy3_data_dir=self.archive_tmy3_data_dir,
        ep_tmy3_cache_dir=self.ep_tmy3_cache_dir,
        simulation_epw_dir=self.simulation_epw_dir,
    )