Example #1
# Imports assumed by this snippet (module paths inferred from Example #3).
from electricitylci.eia923_generation import eia923_primary_fuel
from electricitylci.eia860_facilities import eia860_balancing_authority


def eia_facility_fuel_region(year):
    """Combine EIA-923 primary fuel and EIA-860 balancing authority data."""
    primary_fuel = eia923_primary_fuel(year=year)
    ba_match = eia860_balancing_authority(year)
    # Harmonize the plant identifier dtype before merging.
    primary_fuel["Plant Id"] = primary_fuel["Plant Id"].astype(int)
    ba_match["Plant Id"] = ba_match["Plant Id"].astype(int)
    combined = primary_fuel.merge(ba_match, on='Plant Id')
    # Convert from percent to fraction.
    combined['primary fuel percent gen'] = (
        combined['primary fuel percent gen'] / 100)

    combined.rename(
        columns={
            'primary fuel percent gen':
                'PercentGenerationfromDesignatedFuelCategory',
            'Plant Id': 'FacilityID',
            'fuel category': 'FuelCategory',
            'NERC Region': 'NERC',
        },
        inplace=True,
    )

    return combined
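
A minimal usage sketch; the year is illustrative and assumes the corresponding EIA-923 and EIA-860 data are available locally:

facility_regions = eia_facility_fuel_region(2016)  # hypothetical example year
print(facility_regions[["FacilityID", "FuelCategory", "NERC"]].head())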
Example #2
# Module-level imports assumed by this snippet (the function comes from a
# larger electricitylci module that also defines module_logger, model_specs,
# eia923_primary_fuel, aggregate_facility_flows,
# calculate_electricity_by_source, and add_data_collection_score):
import ast

import numpy as np
import pandas as pd
from scipy.special import erfinv
from scipy.stats import t


def aggregate_data(total_db, subregion="BA"):
    """
    Aggregates facility-level emissions to the specified subregion and
    calculates emission factors based on the total emission and total
    electricity generation.

    Parameters
    ----------
    total_db : dataframe
        Facility-level emissions as generated by
        create_generation_process_df
    subregion : str, optional
        The level of subregion that the data will be aggregated to. Choices
        are 'all', 'NERC', 'BA', 'US', by default 'BA'.

    Returns
    -------
    dataframe
        The dataframe provides the emissions aggregated to the specified
        subregion for each technology and stage in the input total_db. This
        dataframe includes an average emission factor and, when applicable,
        uncertainty distributions.
    """
    from electricitylci.aggregation_selector import subregion_col

    def geometric_mean(p_series, df, cols):
        # Returns a tuple of (geometric mean, 0, upper confidence bound)
        # for a series of facility emission factors, or None when the
        # interval cannot be computed.
        # Alternatively we can use scipy.stats.lognorm to fit a distribution
        # and provide the parameters
        if len(p_series) > 3 and p_series.quantile(0.5) > 0:
            # result = gmean(p_series.to_numpy()+1)-1
            module_logger.debug(
                f"Calculating confidence interval for "
                f"{df.loc[p_series.index[0], groupby_cols].values}")
            module_logger.debug(f"{p_series.values}")
            with np.errstate(all='raise'):
                try:
                    data = p_series.to_numpy()
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with input data")
                    return None
                try:
                    log_data = np.log(data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with log function")
                    return None
                try:
                    mean = np.mean(log_data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with mean function")
                    return None
                n = len(data)
                try:
                    sd = np.std(log_data) / np.sqrt(n)
                    sd2 = sd**2
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with std function")
                    return None
                try:
                    # The alpha keyword was renamed and later removed from
                    # scipy.stats.t.interval; passing the confidence level
                    # positionally works across SciPy versions.
                    pi1, pi2 = t.interval(0.90, df=n - 2, loc=mean, scale=sd)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with t function")
                    return None
                try:
                    # Upper bound of an approximate (Cox-style) confidence
                    # interval for the mean of a lognormal distribution.
                    upper_interval = np.max([
                        mean + sd2 / 2 + pi2 * np.sqrt(sd2 / n + sd2**2 /
                                                       (2 * (n - 1))),
                        mean + sd2 / 2 - pi2 * np.sqrt(sd2 / n + sd2**2 /
                                                       (2 * (n - 1))),
                    ])
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with interval function")
                    return None
                try:
                    result = (np.exp(mean), 0, np.exp(upper_interval))
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug(
                        f"Unable to calculate geometric_mean\n"
                        f"{df.loc[p_series.index[0], groupby_cols].values}\n"
                        f"{p_series.values}")
                    return None
                return result
        else:
            return None

    def calc_geom_std(df):
        # Estimate lognormal parameters (geometric mean and geometric
        # standard deviation) consistent with the aggregated emission factor
        # and the upper bound of its uncertainty interval.
        if region_agg is not None:
            debug_string = f"{df[region_agg]}-{df['FuelCategory']}-{df['FlowName']}"
        else:
            debug_string = f"{df['FuelCategory']}-{df['FlowName']}"
        module_logger.debug(debug_string)
        params = df["uncertaintyLognormParams"]
        if params is None:
            return None, None
        if isinstance(params, str):
            params = ast.literal_eval(params)
        try:
            length = len(params)
        except TypeError:
            module_logger.info(
                f"Error calculating length of uncertaintyLognormParams: "
                f"{df['uncertaintyLognormParams']}")
            return None, None

        if length != 3:
            module_logger.info(
                f"Error estimating standard deviation - length: {length}")
            return None, None
        else:
            # In some cases, the final emission factor is very different from
            # the geometric mean of the individual emission factors.
            # Depending on the severity, this can be a clear sign of outliers
            # having a large impact on the final emission factor. When
            # uncertainty is generated for these cases, the results can be
            # nonsensical - hence we skip them. A more aggressive approach
            # would be to re-assign the emission factor as well.
            if df["Emission_factor"] > params[2]:
                return None, None
            else:
                # For a lognormal distribution, the mean is
                # exp(mu + sigma**2/2) and the 95th percentile is
                # exp(mu + z*sigma) with z = sqrt(2)*erfinv(2*0.95 - 1);
                # the log of their ratio yields the quadratic
                # a*sigma**2 + b*sigma + c = 0 solved below.
                c = np.log(params[2]) - np.log(df["Emission_factor"])
                b = -2**0.5 * erfinv(2 * 0.95 - 1)
                a = 0.5
                disc = b**2 - 4 * a * c
                if disc < 0:
                    # No real solution for the standard deviation.
                    return None, None
                sd1 = (-b + disc**0.5) / (2 * a)
                sd2 = (-b - disc**0.5) / (2 * a)
                # Use the smaller root for the geometric standard deviation.
                sd = min(sd1, sd2)
                geostd = np.exp(sd)
                geomean = np.exp(
                    np.log(df["Emission_factor"]) - 0.5 * sd**2)
                if not np.isfinite(geostd) or geostd == 0:
                    return None, None
                return str(geomean), str(geostd)

    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        groupby_cols = (
            region_agg + fuel_agg +
            ["stage_code", "FlowName", "Compartment", "FlowUUID", "Unit"])
        elec_df_groupby_cols = (region_agg + fuel_agg +
                                ["Year", "source_string"])
    else:
        groupby_cols = fuel_agg + [
            "stage_code", "FlowName", "Compartment", "FlowUUID", "Unit"
        ]
        elec_df_groupby_cols = fuel_agg + ["Year", "source_string"]
    if model_specs.replace_egrid:
        primary_fuel_df = eia923_primary_fuel(year=model_specs.eia_gen_year)
        primary_fuel_df.rename(columns={'Plant Id': "eGRID_ID"}, inplace=True)
        primary_fuel_df["eGRID_ID"] = primary_fuel_df["eGRID_ID"].astype(int)
        key_df = (primary_fuel_df[[
            "eGRID_ID", "FuelCategory"
        ]].dropna().drop_duplicates(subset="eGRID_ID").set_index("eGRID_ID"))
        total_db.loc[total_db["FuelCategory"] != "ALL",
                     "FuelCategory"] = total_db["eGRID_ID"].map(
                         key_df["FuelCategory"])
    total_db["FlowUUID"] = total_db["FlowUUID"].fillna(value="dummy-uuid")
    total_db = aggregate_facility_flows(total_db)
    total_db, electricity_df = calculate_electricity_by_source(
        total_db, subregion)
    total_db["FlowAmount"].replace(to_replace=0, value=1E-15, inplace=True)
    total_db = add_data_collection_score(total_db, electricity_df, subregion)
    total_db["facility_emission_factor"] = (total_db["FlowAmount"] /
                                            total_db["Electricity"])
    total_db.dropna(subset=["facility_emission_factor"], inplace=True)

    def wtd_mean(pdser, total_db, cols):
        try:
            wts = total_db.loc[pdser.index, "FlowAmount"]
            result = np.average(pdser, weights=wts)
        except ZeroDivisionError:
            module_logger.debug(
                f"Error calculating weighted mean for {pdser.name} - "
                f"likely from 0 FlowAmounts"
                # f"{total_db.loc[pdser.index[0],cols]}"
            )
            try:
                with np.errstate(all='raise'):
                    result = np.average(pdser)
            except (ArithmeticError, ValueError, FloatingPointError):
                result = float("nan")
        return result

    wm = lambda x: wtd_mean(x, total_db, groupby_cols)
    geo_mean = lambda x: geometric_mean(x, total_db, groupby_cols)
    geo_mean.__name__ = "geo_mean"
    print(
        "Aggregating flow amounts, dqi information, and calculating uncertainty"
    )

    database_f3 = total_db.groupby(
        groupby_cols + ["Year", "source_string"],
        as_index=False).agg({
            "FlowAmount": ["sum", "count"],
            "TemporalCorrelation": wm,
            "TechnologicalCorrelation": wm,
            "GeographicalCorrelation": wm,
            "DataCollection": wm,
            "ReliabilityScore": wm,
            "facility_emission_factor": ["min", "max", geo_mean],
        })
    database_f3.columns = groupby_cols + [
        "Year",
        "source_string",
        "FlowAmount",
        "FlowAmountCount",
        "TemporalCorrelation",
        "TechnologicalCorrelation",
        "GeographicalCorrelation",
        "DataCollection",
        "ReliabilityScore",
        "uncertaintyMin",
        "uncertaintyMax",
        "uncertaintyLognormParams",
    ]

    criteria = database_f3["Compartment"] == "input"
    database_f3.loc[criteria, "uncertaintyLognormParams"] = None
    database_f3 = database_f3.merge(
        right=electricity_df,
        left_on=elec_df_groupby_cols,
        right_on=elec_df_groupby_cols,
        how="left",
    )

    canadian_criteria = database_f3["FuelCategory"] == "ALL"
    if region_agg:
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_db[groupby_cols + ["Electricity"]],
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        ).drop_duplicates(subset=groupby_cols)
    else:
        total_grouped = total_db.groupby(by=groupby_cols,
                                         as_index=False)["Electricity"].sum()
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_grouped,
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        )
    canada_db.index = database_f3.loc[canadian_criteria, :].index
    database_f3.loc[database_f3["FlowUUID"] == "dummy-uuid",
                    "FlowUUID"] = float("nan")
    database_f3.loc[canada_db.index,
                    "electricity_sum"] = canada_db["Electricity"]
    database_f3["Emission_factor"] = (database_f3["FlowAmount"] /
                                      database_f3["electricity_sum"])
    # Infinite values generally come from places with 0 generation; this
    # happens particularly with the Canadian mixes.
    database_f3["Emission_factor"] = database_f3["Emission_factor"].replace(
        [float("inf"), float("-inf")], 0)
    if region_agg is not None:
        database_f3["GeomMean"], database_f3["GeomSD"] = zip(*database_f3[[
            "Emission_factor", "uncertaintyLognormParams", "uncertaintyMin",
            "uncertaintyMax", "FuelCategory", "FlowName"
        ] + region_agg].apply(calc_geom_std, axis=1))
    else:
        database_f3["GeomMean"], database_f3["GeomSD"] = zip(*database_f3[[
            "Emission_factor", "uncertaintyLognormParams", "uncertaintyMin",
            "uncertaintyMax", "FuelCategory", "FlowName"
        ]].apply(calc_geom_std, axis=1))
    database_f3.sort_values(by=groupby_cols, inplace=True)
    return database_f3
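
A hedged usage sketch, assuming a facility-level dataframe such as the one returned by create_generation_process_df in Example #3:

total_db = create_generation_process_df()
# Aggregate to balancing authorities; 'all', 'NERC', or 'US' are also valid.
ba_results = aggregate_data(total_db, subregion="BA")
# Rows carry an average Emission_factor plus, where they could be computed,
# GeomMean/GeomSD lognormal uncertainty parameters.
print(ba_results[["FuelCategory", "FlowName", "Emission_factor"]].head())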
Example #3
# Module-level imports assumed by this snippet; model_specs and
# map_emissions_to_fedelemflows are provided elsewhere in the enclosing
# electricitylci module.
import pandas as pd


def create_generation_process_df():
    """
    Reads emissions and generation data from different sources to provide
    facility-level emissions. The most important inputs to this process come
    from the model configuration file.

    Parameters
    ----------
    None

    Returns
    -------
    dataframe
        Dataframe includes all facility-level emissions
    """
    from electricitylci.eia923_generation import (build_generation_data,
                                                  eia923_primary_fuel)
    from electricitylci.egrid_filter import (
        egrid_facilities_to_include,
        emissions_and_waste_for_selected_egrid_facilities,
    )
    from electricitylci.generation import (
        egrid_facilities_w_fuel_region,
        add_technological_correlation_score,
        add_temporal_correlation_score,
    )
    import electricitylci.emissions_other_sources as em_other
    import electricitylci.ampd_plant_emissions as ampd
    from electricitylci.combinator import ba_codes
    import electricitylci.manual_edits as edits

    COMPARTMENT_DICT = {
        "emission/air": "air",
        "emission/water": "water",
        "emission/ground": "ground",
        "input": "input",
        "output": "output",
        "waste": "waste",
        "air": "air",
        "water": "water",
        "ground": "ground",
    }
    if model_specs.replace_egrid:
        generation_data = build_generation_data().drop_duplicates()
        cems_df = ampd.generate_plant_emissions(model_specs.eia_gen_year)
        cems_df.drop(columns=["FlowUUID"], inplace=True)
        emissions_and_waste_for_selected_egrid_facilities = em_other.integrate_replace_emissions(
            cems_df, emissions_and_waste_for_selected_egrid_facilities)
    else:
        from electricitylci.egrid_filter import electricity_for_selected_egrid_facilities
        generation_data = electricity_for_selected_egrid_facilities
        generation_data["Year"] = model_specs.egrid_year
        generation_data["FacilityID"] = generation_data["FacilityID"].astype(
            int)


#        generation_data = build_generation_data(
#            egrid_facilities_to_include=egrid_facilities_to_include
#        )
    # NOTE: the result of this drop is discarded (it is neither assigned nor
    # called with inplace=True); the duplicated FacilityID columns are
    # instead removed after the merges below.
    emissions_and_waste_for_selected_egrid_facilities.drop(
        columns=["FacilityID"])
    emissions_and_waste_for_selected_egrid_facilities[
        "eGRID_ID"] = emissions_and_waste_for_selected_egrid_facilities[
            "eGRID_ID"].astype(int)
    final_database = pd.merge(
        left=emissions_and_waste_for_selected_egrid_facilities,
        right=generation_data,
        right_on=["FacilityID", "Year"],
        left_on=["eGRID_ID", "Year"],
        how="left",
    )
    egrid_facilities_w_fuel_region[
        "FacilityID"] = egrid_facilities_w_fuel_region["FacilityID"].astype(
            int)
    final_database = pd.merge(
        left=final_database,
        right=egrid_facilities_w_fuel_region,
        left_on="eGRID_ID",
        right_on="FacilityID",
        how="left",
        suffixes=["", "_right"],
    )
    if model_specs.replace_egrid:
        primary_fuel_df = eia923_primary_fuel(year=model_specs.eia_gen_year)
        primary_fuel_df.rename(columns={'Plant Id': "eGRID_ID"}, inplace=True)
        primary_fuel_df["eGRID_ID"] = primary_fuel_df["eGRID_ID"].astype(int)
        key_df = (primary_fuel_df[[
            "eGRID_ID", "FuelCategory"
        ]].dropna().drop_duplicates(subset="eGRID_ID").set_index("eGRID_ID"))
        final_database["FuelCategory"] = final_database["eGRID_ID"].map(
            key_df["FuelCategory"])
    else:
        key_df = (final_database[[
            "eGRID_ID", "FuelCategory"
        ]].dropna().drop_duplicates(subset="eGRID_ID").set_index("eGRID_ID"))
        final_database.loc[final_database["FuelCategory"].isnull(),
                           "FuelCategory"] = final_database.loc[
                               final_database["FuelCategory"].isnull(),
                               "eGRID_ID"].map(key_df["FuelCategory"])
    # if replace_egrid:
    #     final_database["FuelCategory"].fillna(
    #         final_database["FuelCategory_right"], inplace=True
    #     )
    final_database["Final_fuel_agg"] = final_database["FuelCategory"]
    # if model_specs.use_primaryfuel_for_coal:
    #     final_database.loc[
    #         final_database["FuelCategory"] == "COAL", ["Final_fuel_agg"]
    #     ] = final_database.loc[
    #         final_database["FuelCategory"] == "COAL", "PrimaryFuel"
    #     ]
    try:
        # If the merges produced two Year columns, keep only the rows where
        # they agree and drop the duplicate.
        year_filter = final_database["Year_x"] == final_database["Year_y"]
        final_database = final_database.loc[year_filter, :]
        final_database.drop(columns="Year_y", inplace=True)
    except KeyError:
        pass
    final_database.rename(columns={"Year_x": "Year"}, inplace=True)
    final_database = map_emissions_to_fedelemflows(final_database)
    dup_cols_check = [
        "FacilityID",
        "FuelCategory",
        "FlowName",
        "FlowAmount",
        "Compartment",
    ]
    final_database = final_database.loc[:,
                                        ~final_database.columns.duplicated()]
    final_database = final_database.drop_duplicates(subset=dup_cols_check)
    final_database.drop(
        columns=["FuelCategory", "FacilityID_x", "FacilityID_y"], inplace=True)
    final_database.rename(
        columns={
            "Final_fuel_agg": "FuelCategory",
            "TargetFlowUUID": "FlowUUID",
        },
        inplace=True,
    )
    final_database = add_temporal_correlation_score(
        final_database, model_specs.electricity_lci_target_year)
    final_database = add_technological_correlation_score(final_database)
    final_database["DataCollection"] = 5
    final_database["GeographicalCorrelation"] = 1

    final_database["eGRID_ID"] = final_database["eGRID_ID"].astype(int)

    final_database.sort_values(by=["eGRID_ID", "Compartment", "FlowName"],
                               inplace=True)
    final_database["stage_code"] = "Power plant"
    final_database["Compartment_path"] = final_database["Compartment"]
    final_database["Compartment"] = final_database["Compartment_path"].map(
        COMPARTMENT_DICT)
    final_database["Balancing Authority Name"] = final_database[
        "Balancing Authority Code"].map(ba_codes["BA_Name"])
    final_database["EIA_Region"] = final_database[
        "Balancing Authority Code"].map(ba_codes["EIA_Region"])
    final_database["FERC_Region"] = final_database[
        "Balancing Authority Code"].map(ba_codes["FERC_Region"])
    final_database = edits.check_for_edits(final_database, "generation.py",
                                           "create_generation_process_df")
    return final_database
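
A usage sketch, under the assumption that the electricitylci model configuration (model_specs) has been loaded before this function runs:

facility_emissions = create_generation_process_df()
print(facility_emissions[
    ["eGRID_ID", "FuelCategory", "FlowName", "FlowAmount", "Compartment"]
].head())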