def _clean_results(self, results):
    """Convert a list of result dicts into a long-format, downcast DataFrame.

    Parameters
    ----------
    results : list of dict
        Raw records to be tabulated.

    Returns
    -------
    pd.DataFrame
        Long (melted) frame keyed by the id/date/geo/buffer columns,
        with dtypes downcast to reduce memory.
    """
    # Tabulate the raw records into a wide DataFrame.
    frame = pd.DataFrame(results)
    # Melt every non-identifier column into (variable, value) pairs.
    identifiers = [self.id_col, self.date_col, self.geo_col, 'buffer']
    melted = frame.melt(id_vars=identifiers)
    # Shrink dtypes before returning.
    return helper.memory_downcaster(melted)
def _clean_eightsixty(self):
    """Filter EIA-860 generator records down to operating fossil plants.

    Pipeline (each step logs the surviving row count):
      1. Parse report/retirement dates and derive year columns.
      2. Keep only report years in ``self.years``.
      3. Drop units with a planned retirement within two years of the
         report year (missing retirement years are treated as 2099).
      4. Keep only 'existing' (operational) units.
      5. Keep only coal/gas/oil fuel types.
      6. Aggregate generator rows up to (plant_id_eia, report_year).

    Returns
    -------
    self, with ``self.eightsixty`` cleaned and downcast in place.
    """
    keep = [
        'plant_id_eia',
        'report_year',
        'operational_status',
        'capacity_mw',
        'summer_capacity_mw',
        'winter_capacity_mw',
        'fuel_type_code_pudl',
        'multiple_fuels',
        'planned_retirement_year',
        'minimum_load_mw',
    ]
    agg_dict = {
        'capacity_mw': 'sum',
        'summer_capacity_mw': 'sum',
        'winter_capacity_mw': 'sum',
        'minimum_load_mw': 'sum',
        'fuel_type_code_pudl': 'first',
        'multiple_fuels': 'max',
        'planned_retirement_year': 'max',
    }

    # --- convert to datetime ---
    self.eightsixty['report_date'] = pd.to_datetime(
        self.eightsixty['report_date'])
    self.eightsixty['planned_retirement_date'] = pd.to_datetime(
        self.eightsixty['planned_retirement_date'])
    # Vectorized .dt.year replaces the original per-row comprehensions;
    # NaT entries become NaN, matching the old behavior.
    self.eightsixty['report_year'] = self.eightsixty['report_date'].dt.year
    self.eightsixty['planned_retirement_year'] = \
        self.eightsixty['planned_retirement_date'].dt.year

    # --- only take input year ---
    self.eightsixty = self.eightsixty.loc[
        self.eightsixty['report_year'].isin(self.years)]
    log.info(
        f"........filtering to report years: len {len(self.eightsixty)}")

    # --- take out possible retirements within next two years ---
    # fill in nans for plants with no planned retirement; plain
    # assignment avoids the chained inplace fillna on a filtered frame
    # (SettingWithCopy hazard, deprecated in pandas 2.x).
    self.eightsixty['planned_retirement_year'] = \
        self.eightsixty['planned_retirement_year'].fillna(2099)
    self.eightsixty = self.eightsixty.loc[
        self.eightsixty['planned_retirement_year'] >
        self.eightsixty['report_year'] + 2]
    log.info(
        f"........filtering out retirements in next year: len {len(self.eightsixty)}"
    )

    # --- only take operational assets ---
    self.eightsixty = self.eightsixty.loc[
        self.eightsixty['operational_status'] == 'existing']
    log.info(
        f"........filtering out non-operational assets: len {len(self.eightsixty)}"
    )

    # --- only take fossil generators ---
    self.eightsixty = self.eightsixty.loc[
        self.eightsixty['fuel_type_code_pudl'].isin(['coal', 'gas', 'oil'])]
    log.info(
        f"........filtering out non-fossil generators: len {len(self.eightsixty)}"
    )

    # --- filter out columns ---
    self.eightsixty = self.eightsixty[keep]

    # --- groupby to reduce multiple generators at one plant ---
    self.eightsixty = self.eightsixty.groupby(
        ['plant_id_eia', 'report_year'], as_index=False).agg(agg_dict)
    log.info(
        f"........reducing generators to plant level: len {len(self.eightsixty)}"
    )

    # --- make small ---
    self.eightsixty = helper.memory_downcaster(self.eightsixty)
    return self
def _load_plants_entity_eia(self):
    """Load the full plants_entity_eia table into ``self.plants``.

    Reads every row from the PUDL database via ``self.engine`` and
    downcasts dtypes to reduce memory before storing.
    """
    log.info('....loading plant level data')
    query = "SELECT * FROM plants_entity_eia"
    plants = pd.read_sql_query(query, self.engine)
    self.plants = helper.memory_downcaster(plants)
    return self
def _load_generators_eia860(self):
    """Load the full generators_eia860 table into ``self.eightsixty``.

    Reads every row from the PUDL database via ``self.engine`` and
    downcasts dtypes to reduce memory before storing.
    """
    log.info('....loading generator eightsixty data')
    query = "SELECT * FROM generators_eia860"
    generators = pd.read_sql_query(query, self.engine)
    self.eightsixty = helper.memory_downcaster(generators)
    return self
def _clean_cems(self):
    """Clean raw EPA CEMS emissions data into plant-level time series.

    Steps:
      1. Optionally keep only NOX observations whose measurement flag is
         in ``self.measurement_flags`` (CO2/SO2 are calculated or NaN).
      2. Rename raw CEMS columns to snake_case names.
      3. Drop rows with missing emissions values.
      4. Build a ``datetime_utc`` column (hourly if ``ts_frequency=='H'``,
         otherwise daily).
      5. Drop unit-days without a full 24 hours of observations.
      6. Aggregate units to plants, then resample to ``ts_frequency``.
      7. Drop zero-load rows (prevents later division by zero), convert
         CO2 tons to lbs, and downcast dtypes.

    Returns
    -------
    self, with ``self.cems`` cleaned in place.
    """
    log.info(f"........postprocessing CEMS, len: {len(self.cems)}")
    rename_dict = {
        'STATE': 'state',
        'ORISPL_CODE': 'plant_id_eia',
        'UNIT_ID': 'unit',
        'OP_DATE': 'date',
        'OP_HOUR': 'hour',
        'OP_TIME': 'operational_time',
        'GLOAD (MW)': 'gross_load_mw',
        'SO2_MASS (lbs)': 'so2_lbs',
        'NOX_MASS (lbs)': 'nox_lbs',
        'CO2_MASS (tons)': 'co2_tons',
    }

    # --- Only take observed CEMS ---
    if len(self.measurement_flags) > 0:
        for c in ['NOX']:  # CO2 and SO2 are all calculated or nans
            log.info(
                f'....removing {c} that are not in {self.measurement_flags}'
            )
            self.cems = self.cems[self.cems[f"{c}_RATE_MEASURE_FLG"].isin(
                self.measurement_flags)]
            log.info(f'........len after removing: {len(self.cems)}')

    # --- Rename columns ---
    self.cems.rename(rename_dict, axis='columns', inplace=True)

    # --- drop nans ---
    log.info("....dropping observations with nans")
    self.cems.dropna(subset=['so2_lbs', 'nox_lbs', 'co2_tons'],
                     inplace=True)
    log.info(f"........len after drop: {len(self.cems)}")

    # --- Convert to datetime ---
    if self.ts_frequency == 'H':
        # Combine date with an HH:00:00 suffix for hourly timestamps.
        self.cems['hour'] = [str(i) + ':00:00' for i in self.cems['hour']]
        self.cems['datetime_utc'] = pd.to_datetime(self.cems['date'] + ' ' +
                                                   self.cems['hour'])
    else:
        self.cems['datetime_utc'] = pd.to_datetime(self.cems['date'])

    # --- drop plants without 24 entries in a date ---
    log.info(
        f'....dropping observations without a full 24 hours of data, len before: {len(self.cems)}'
    )
    # Built-in 'count' aggregation is equivalent to the original
    # transform(lambda x: x.count()) but runs at C speed.
    self.cems['count'] = self.cems.groupby(
        ['date', 'plant_id_eia', 'unit'])['plant_id_eia'].transform('count')
    self.cems = self.cems.loc[self.cems['count'] == 24]
    self.cems.drop(['count', 'date'], axis='columns', inplace=True)
    log.info(f'........len after drop: {len(self.cems)}')

    # --- Aggregate by unit ---
    agg_dict = {
        'gross_load_mw': 'sum',
        'so2_lbs': 'sum',
        'nox_lbs': 'sum',
        'co2_tons': 'sum',
        'operational_time': 'mean',
    }
    self.cems = self.cems.groupby(['plant_id_eia', 'datetime_utc'],
                                  as_index=False).agg(agg_dict)

    # --- Aggregate by ts_frequency ---
    self.cems = self.cems.groupby('plant_id_eia').resample(
        self.ts_frequency,
        on='datetime_utc').sum()  #TODO: check how resampling works
    # plant_id_eia is duplicated into the index by resample.
    self.cems.drop(['plant_id_eia'],
                   axis='columns',
                   inplace=True,
                   errors='ignore')
    self.cems.reset_index(inplace=True, drop=False)

    # --- Drop rows with no gross load (causing division by 0 error) ---
    self.cems = self.cems.loc[self.cems['gross_load_mw'] > 0]

    # --- Drop unnecessary columns ---
    keep = [
        'datetime_utc', 'plant_id_eia', 'gross_load_mw', 'so2_lbs',
        'nox_lbs', 'co2_tons', 'operational_time'
    ]
    self.cems = self.cems[keep]

    # --- convert co2 from tons to lbs ---
    self.cems['co2_lbs'] = self.cems['co2_tons'] * 2000
    self.cems = self.cems.drop(['co2_tons'], axis='columns')

    # --- reduce size ---
    self.cems = helper.memory_downcaster(self.cems)

    # --- reset index ---
    self.cems.reset_index(drop=True, inplace=True)
    return self