Example no. 1
0
    def _clean_results(self, results):
        """Reshape raw result records into a long-format, memory-downcast DataFrame.

        Args:
            results: list of dicts (one per observation) to be tabulated.

        Returns:
            A melted DataFrame keyed by the instance's id/date/geo columns
            plus 'buffer', downcast via ``helper.memory_downcaster``.
        """
        # Tabulate the raw records into a wide DataFrame.
        wide = pd.DataFrame(results)

        # Melt all non-identifier columns into (variable, value) pairs.
        identifiers = [self.id_col, self.date_col, self.geo_col, 'buffer']
        melted = pd.melt(wide, identifiers)

        # Downcast dtypes to shrink the memory footprint before returning.
        return helper.memory_downcaster(melted)
Example no. 2
0
    def _clean_eightsixty(self):
        """Filter and aggregate EIA-860 generator data to the plant level.

        Keeps only operational fossil (coal/gas/oil) generators in the
        requested report years that are not planning to retire within the
        next two years, then collapses generators to one row per
        plant/report-year.

        Returns:
            self, with ``self.eightsixty`` cleaned, aggregated, and
            memory-downcast (for method chaining).
        """
        # Columns retained after filtering.
        keep = [
            'plant_id_eia',
            'report_year',
            'operational_status',
            'capacity_mw',
            'summer_capacity_mw',
            'winter_capacity_mw',
            'fuel_type_code_pudl',
            'multiple_fuels',
            'planned_retirement_year',
            'minimum_load_mw',
        ]

        # How generator rows collapse to one row per plant/year.
        agg_dict = {
            'capacity_mw': 'sum',
            'summer_capacity_mw': 'sum',
            'winter_capacity_mw': 'sum',
            'minimum_load_mw': 'sum',
            'fuel_type_code_pudl': 'first',
            'multiple_fuels': 'max',
            'planned_retirement_year': 'max',
        }

        # --- convert to datetime ---
        self.eightsixty['report_date'] = pd.to_datetime(
            self.eightsixty['report_date'])
        self.eightsixty['planned_retirement_date'] = pd.to_datetime(
            self.eightsixty['planned_retirement_date'])
        # Vectorized .dt.year replaces the Python-level list comprehensions;
        # NaT dates become NaN, matching the old element-wise behavior.
        self.eightsixty['report_year'] = self.eightsixty['report_date'].dt.year
        self.eightsixty['planned_retirement_year'] = (
            self.eightsixty['planned_retirement_date'].dt.year)

        # --- only take input year ---
        self.eightsixty = self.eightsixty.loc[
            self.eightsixty['report_year'].isin(self.years)]
        log.info(
            f"........filtering to report years: len {len(self.eightsixty)}")

        # --- take out possible retirements within next two years ---
        # Plants with no planned retirement get a far-future sentinel year.
        # Plain assignment instead of fillna(inplace=True) on a column
        # selection: that form is chained assignment, unreliable (and
        # deprecated) under pandas copy-on-write.
        self.eightsixty['planned_retirement_year'] = (
            self.eightsixty['planned_retirement_year'].fillna(2099))
        self.eightsixty = self.eightsixty.loc[
            self.eightsixty['planned_retirement_year'] >
            self.eightsixty['report_year'] + 2]
        log.info(
            f"........filtering out retirements in next two years: len {len(self.eightsixty)}"
        )

        # --- only take operational assets ---
        self.eightsixty = self.eightsixty.loc[
            self.eightsixty['operational_status'] == 'existing']
        log.info(
            f"........filtering out non-operational assets: len {len(self.eightsixty)}"
        )

        # --- only take fossil generators ---
        self.eightsixty = self.eightsixty.loc[
            self.eightsixty['fuel_type_code_pudl'].isin(['coal', 'gas',
                                                         'oil'])]
        log.info(
            f"........filtering out non-fossil generators: len {len(self.eightsixty)}"
        )

        # --- filter out columns ---
        self.eightsixty = self.eightsixty[keep]

        # --- groupby to reduce multiple generators at one plant ---
        self.eightsixty = self.eightsixty.groupby(
            ['plant_id_eia', 'report_year'], as_index=False).agg(agg_dict)
        log.info(
            f"........reducing generators to plant level: len {len(self.eightsixty)}"
        )

        # --- make small ---
        self.eightsixty = helper.memory_downcaster(self.eightsixty)
        return self
Example no. 3
0
 def _load_plants_entity_eia(self):
     """Read the plants_entity_eia table into ``self.plants`` (downcast).

     Returns:
         self, for method chaining.
     """
     log.info('....loading plant level data')
     # Pull the whole table, then shrink dtypes before storing it.
     query = "SELECT * FROM plants_entity_eia"
     frame = pd.read_sql_query(query, self.engine)
     self.plants = helper.memory_downcaster(frame)
     return self
Example no. 4
0
 def _load_generators_eia860(self):
     """Read the generators_eia860 table into ``self.eightsixty`` (downcast).

     Returns:
         self, for method chaining.
     """
     log.info('....loading generator eightsixty data')
     # Pull the whole table, then shrink dtypes before storing it.
     query = "SELECT * FROM generators_eia860"
     frame = pd.read_sql_query(query, self.engine)
     self.eightsixty = helper.memory_downcaster(frame)
     return self
Example no. 5
0
    def _clean_cems(self):
        """Clean hourly CEMS emissions data and aggregate it to plant level.

        Pipeline: optionally keep only measured NOX observations, rename raw
        CEMS columns, drop rows with missing emissions, build a UTC
        timestamp, keep only plant/unit-days with a full 24 hourly rows,
        aggregate units to plants, resample to ``self.ts_frequency``, drop
        zero-load rows, and convert CO2 from tons to lbs.

        Returns:
            self, with ``self.cems`` cleaned and memory-downcast
            (for method chaining).
        """
        log.info(f"........postprocessing CEMS, len: {len(self.cems)}")
        # Map raw CEMS headers to tidy snake_case names.
        rename_dict = {
            'STATE': 'state',
            'ORISPL_CODE': 'plant_id_eia',
            'UNIT_ID': 'unit',
            'OP_DATE': 'date',
            'OP_HOUR': 'hour',
            'OP_TIME': 'operational_time',
            'GLOAD (MW)': 'gross_load_mw',
            'SO2_MASS (lbs)': 'so2_lbs',
            'NOX_MASS (lbs)': 'nox_lbs',
            'CO2_MASS (tons)': 'co2_tons',
        }

        # ---Only take observed CEMS ---
        # Only NOX is flag-filtered here; per the inline note, CO2 and SO2
        # flags are always calculated or NaN, so filtering them would drop
        # everything.
        if len(self.measurement_flags) > 0:
            for c in ['NOX']:  # CO2 and SO2 are all calculated or nans
                log.info(
                    f'....removing {c} that are not in {self.measurement_flags}'
                )
                self.cems = self.cems[self.cems[f"{c}_RATE_MEASURE_FLG"].isin(
                    self.measurement_flags)]
                log.info(f'........len after removing: {len(self.cems)}')

        # --- Rename columns ---
        # Must happen before the dropna below, which uses the new names.
        self.cems.rename(rename_dict, axis='columns', inplace=True)

        # --- drop nans ---
        log.info(f"....dropping observations with nans")
        self.cems.dropna(subset=['so2_lbs', 'nox_lbs', 'co2_tons'],
                         inplace=True)
        log.info(f"........len after drop: {len(self.cems)}")

        # --- Convert to datetime ---
        # Hourly frequency needs the hour folded into the timestamp;
        # coarser frequencies use the date alone.
        if self.ts_frequency == 'H':
            self.cems['hour'] = [str(i) + ':00:00' for i in self.cems['hour']]
            self.cems['datetime_utc'] = pd.to_datetime(self.cems['date'] +
                                                       ' ' + self.cems['hour'])
        else:
            self.cems['datetime_utc'] = pd.to_datetime(self.cems['date'])

        # --- drop plants without 24 entries in a date ---
        # NOTE(review): assumes the raw CEMS pull is hourly, so 24 rows per
        # (date, plant, unit) means one complete day — confirm upstream.
        log.info(
            f'....dropping observations without a full 24 hours of data, len before: {len(self.cems)}'
        )
        self.cems['count'] = self.cems.groupby(
            ['date', 'plant_id_eia',
             'unit'])['plant_id_eia'].transform(lambda x: x.count())
        self.cems = self.cems.loc[self.cems['count'] == 24]
        self.cems.drop(['count', 'date'], axis='columns', inplace=True)
        log.info(f'........len after drop: {len(self.cems)}')

        # --- Aggregate by unit ---
        # Collapse multiple units at a plant into one row per timestamp.
        agg_dict = {
            'gross_load_mw': 'sum',
            'so2_lbs': 'sum',
            'nox_lbs': 'sum',
            'co2_tons': 'sum',
            'operational_time': 'mean',
        }
        self.cems = self.cems.groupby(['plant_id_eia', 'datetime_utc'],
                                      as_index=False).agg(agg_dict)

        # --- Aggregate by ts_frequency ---
        # NOTE(review): .sum() here also sums 'operational_time', which was
        # averaged across units just above — verify that summing the means
        # over the resample window is intended.
        self.cems = self.cems.groupby('plant_id_eia').resample(
            self.ts_frequency,
            on='datetime_utc').sum()  #TODO: check how resampling works
        self.cems.drop(['plant_id_eia'],
                       axis='columns',
                       inplace=True,
                       errors='ignore')  #duplicated by resample
        self.cems.reset_index(inplace=True, drop=False)

        # --- Drop rows with no gross load (causing division by 0 error) ---
        self.cems = self.cems.loc[self.cems['gross_load_mw'] > 0]

        # --- Drop unnecessary columns ---
        keep = [
            'datetime_utc', 'plant_id_eia', 'gross_load_mw', 'so2_lbs',
            'nox_lbs', 'co2_tons', 'operational_time'
        ]
        self.cems = self.cems[keep]

        # --- convert co2 from tons to lbs ---
        # 1 (short) ton = 2000 lbs.
        self.cems['co2_lbs'] = self.cems['co2_tons'] * 2000
        self.cems = self.cems.drop(['co2_tons'], axis='columns')

        # --- reduce size ---
        self.cems = helper.memory_downcaster(self.cems)

        # --- reset index ---
        self.cems.reset_index(drop=True, inplace=True)

        return self