def add_data_collection_score(db, elec_df, subregion="BA"):
    """
    Adds the data collection score, which is a function of how much of the
    total electricity generated in a subregion is captured by the denominator
    used in the final emission factor.

    Parameters
    ----------
    db : dataframe
        Dataframe containing facility-level emissions as generated by
        create_generation_process_df.
    elec_df : dataframe
        Dataframe containing the totals for various subregion/source
        combinations. These are used as the denominators in the emission
        factors.
    subregion : str, optional
        The level of subregion that the data will be aggregated to. Choices
        are 'all', 'NERC', 'BA', 'US', by default 'BA'.
    """
    from electricitylci.dqi import data_collection_lower_bound_to_dqi
    from electricitylci.aggregation_selector import subregion_col

    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        groupby_cols = region_agg + fuel_agg + ["Year"]
    else:
        groupby_cols = fuel_agg + ["Year"]
    temp_df = db.merge(
        right=elec_df,
        left_on=groupby_cols + ["source_string"],
        right_on=groupby_cols + ["source_string"],
        how="left",
    )
    reduced_db = db.drop_duplicates(subset=groupby_cols + ["eGRID_ID"])
    region_elec = reduced_db.groupby(
        groupby_cols, as_index=False)["Electricity"].sum()
    region_elec.rename(
        columns={"Electricity": "region_fuel_electricity"}, inplace=True)
    temp_df = temp_df.merge(
        right=region_elec,
        left_on=groupby_cols,
        right_on=groupby_cols,
        how="left",
    )
    db["Percent_of_Gen_in_EF_Denominator"] = (
        temp_df["electricity_sum"] / temp_df["region_fuel_electricity"])
    db["DataCollection"] = db["Percent_of_Gen_in_EF_Denominator"].apply(
        lambda x: lookup_score_with_bound_key(
            x, data_collection_lower_bound_to_dqi))
    db = db.drop(columns="Percent_of_Gen_in_EF_Denominator")
    return db
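
# Hedged illustration (not part of the electricitylci API): a minimal sketch
# of how a percent-of-generation ratio could map to a data collection DQI
# score via a lower-bound lookup, which is what lookup_score_with_bound_key
# does with data_collection_lower_bound_to_dqi above. The bound/score pairs
# below are hypothetical placeholders, not the real thresholds from
# electricitylci.dqi.
def _example_data_collection_lookup(ratio, bounds=None):
    """Return the score for the largest lower bound that ratio meets."""
    if bounds is None:
        # hypothetical bounds: higher coverage -> better (lower) score
        bounds = {0.0: 5, 0.4: 4, 0.6: 3, 0.8: 2, 0.9: 1}
    score = None
    for lower_bound in sorted(bounds):
        if ratio >= lower_bound:
            score = bounds[lower_bound]
    return score
# e.g., _example_data_collection_lookup(0.85) -> 2 with the assumed bounds
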
def olcaschema_genprocess(database, upstream_dict={}, subregion="BA"):
    """Turns the given database containing generator facility emissions into
    dictionaries that contain the required data for insertion into an
    openLCA-compatible json-ld. Additionally, default providers for fuel
    inputs are mapped, using the information contained in the dictionary
    containing openLCA-formatted data for the fuels.

    Parameters
    ----------
    database : dataframe
        Dataframe containing aggregated emissions to be turned into openLCA
        unit processes
    upstream_dict : dictionary, optional
        Dictionary as created by upstream_dict.py, containing the openLCA
        formatted data for all of the fuel inputs. This function will use
        the names and UUIDs from the entries to assign them as default
        providers.
    subregion : str, optional
        The subregion level of the aggregated data, by default "BA". See
        aggregation_selector.py for available subregions.

    Returns
    -------
    dictionary: dictionary containing openLCA-formatted data
    """
    from electricitylci.process_dictionary_writer import (
        unit,
        flow_table_creation,
        ref_exchange_creator,
        uncertainty_table_creation,
        process_doc_creation,
    )
    from electricitylci.aggregation_selector import subregion_col

    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        base_cols = region_agg + fuel_agg
    else:
        base_cols = fuel_agg
    non_agg_cols = [
        "stage_code",
        "FlowName",
        "FlowUUID",
        "Compartment",
        "Unit",
        "Year",
        "source_string",
        "TemporalCorrelation",
        "TechnologicalCorrelation",
        "GeographicalCorrelation",
        "DataCollection",
        "ReliabilityScore",
        "uncertaintyMin",
        "uncertaintyMax",
        "uncertaintyLognormParams",
        "Emission_factor",
        "GeomMean",
        "GeomSD",
    ]

    def turn_data_to_dict(data, upstream_dict):
        module_logger.debug(
            f"Turning flows from {data.name} into dictionaries")
        cols_for_exchange_dict = [
            "internalId",
            "@type",
            "avoidedProduct",
            "flow",
            "flowProperty",
            "input",
            "quantitativeReference",
            "baseUncertainty",
            "provider",
            "amount",
            "amountFormula",
            "unit",
            "pedigreeUncertainty",
            "dqEntry",
            "uncertainty",
            "comment",
        ]
        year = ",".join(data["Year"].astype(str).unique())
        datasources = ",".join(data["source_string"].astype(str).unique())
        data["Maximum"] = data["uncertaintyMax"]
        data["Minimum"] = data["uncertaintyMin"]
        data["uncertainty"] = ""
        data["internalId"] = ""
        data["@type"] = "Exchange"
        data["avoidedProduct"] = False
        data["flowProperty"] = ""
        data["input"] = False
        input_filter = (
            (data["Compartment"].str.lower().str.contains("input"))
            | (data["Compartment"].str.lower().str.contains("resource"))
            | (data["Compartment"].str.lower().str.contains("technosphere"))
        )
        data.loc[input_filter, "input"] = True
        data["baseUncertainty"] = ""
        data["provider"] = ""
        data["unit"] = data["Unit"]
        # data["ElementaryFlowPrimeContext"] = data["Compartment"]
        # default_unit = unit("kg")
        # data["unit"] = [default_unit] * len(data)
        data["FlowType"] = "ELEMENTARY_FLOW"
        product_filter = (
            (data["Compartment"].str.lower().str.contains("technosphere"))
            | (data["Compartment"].str.lower().str.contains("valuable"))
        )
        data.loc[product_filter, "FlowType"] = "PRODUCT_FLOW"
        waste_filter = (
            data["Compartment"].str.lower().str.contains("technosphere")
        )
        data.loc[waste_filter, "FlowType"] = "WASTE_FLOW"
        data["flow"] = ""
        provider_filter = data["stage_code"].isin(upstream_dict.keys())
        for index, row in data.loc[provider_filter, :].iterrows():
            provider_dict = {
                "name": upstream_dict[getattr(row, "stage_code")]["name"],
                "categoryPath": upstream_dict[getattr(row, "stage_code")][
                    "category"],
                "processType": "UNIT_PROCESS",
                "@id": upstream_dict[getattr(row, "stage_code")]["uuid"],
            }
            data.at[index, "provider"] = provider_dict
            data.at[index, "unit"] = unit(
                upstream_dict[getattr(row, "stage_code")]["q_reference_unit"])
            data.at[index, "FlowType"] = "PRODUCT_FLOW"
        for index, row in data.iterrows():
            data.at[index, "uncertainty"] = uncertainty_table_creation(
                data.loc[index:index, :])
            data.at[index, "flow"] = flow_table_creation(
                data.loc[index:index, :])
        data["amount"] = data["Emission_factor"]
        data["amountFormula"] = ""
        data["quantitativeReference"] = False
        data["dqEntry"] = (
            "("
            + str(round(data["ReliabilityScore"].iloc[0], 1))
            + ";"
            + str(round(data["TemporalCorrelation"].iloc[0], 1))
            + ";"
            + str(round(data["GeographicalCorrelation"].iloc[0], 1))
            + ";"
            + str(round(data["TechnologicalCorrelation"].iloc[0], 1))
            + ";"
            + str(round(data["DataCollection"].iloc[0], 1))
            + ")"
        )
        data["pedigreeUncertainty"] = ""
        data["comment"] = f"{datasources} - {year}"
        data_for_dict = data[cols_for_exchange_dict]
        data_for_dict = data_for_dict.append(
            ref_exchange_creator(), ignore_index=True)
        data_dict = data_for_dict.to_dict("records")
        return data_dict

    database_groupby = database.groupby(by=base_cols)
    process_df = pd.DataFrame(
        database_groupby[non_agg_cols].apply(
            turn_data_to_dict, (upstream_dict)))
    process_df.columns = ["exchanges"]
    process_df.reset_index(inplace=True)
    process_df["@type"] = "Process"
    process_df["allocationFactors"] = ""
    process_df["defaultAllocationMethod"] = ""
    process_df["location"] = ""
    process_df["parameters"] = ""
    # process_doc_dict = process_doc_creation(process_type)
    # process_df["processDocumentation"] = [process_doc_dict]*len(process_df)
    process_df["processType"] = "UNIT_PROCESS"
    process_df["category"] = (
        "22: Utilities/2211: Electric Power Generation, Transmission and "
        "Distribution/" + process_df[fuel_agg].values
    )
    if region_agg is None:
        process_df["description"] = (
            "Electricity from "
            + process_df[fuel_agg].values
            + " produced at generating facilities in the US."
        )
        process_df["name"] = (
            "Electricity - " + process_df[fuel_agg].values + " - US")
    else:
        process_df["description"] = (
            "Electricity from "
            + process_df[fuel_agg].values
            + " produced at generating facilities in the "
            + process_df[region_agg].values
            + " region."
        )
        process_df["name"] = (
            "Electricity - "
            + process_df[fuel_agg].values
            + " - "
            + process_df[region_agg].values
        )
    process_df["description"] = (
        process_df["description"]
        + " This process was created with ElectricityLCI "
        + "(https://github.com/USEPA/ElectricityLCI) version "
        + elci_version
        + " using the " + model_specs.model_name + " configuration."
    )
    process_df["version"] = make_valid_version_num(elci_version)
    process_df["processDocumentation"] = [
        process_doc_creation(x)
        for x in list(process_df["FuelCategory"].str.lower())
    ]
    process_cols = [
        "@type",
        "allocationFactors",
        "defaultAllocationMethod",
        "exchanges",
        "location",
        "parameters",
        "processDocumentation",
        "processType",
        "name",
        "version",
        "category",
        "description",
    ]
    result = process_df[process_cols].to_dict("index")
    return result
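
# Hedged illustration: the rough shape of a single exchange dictionary that
# turn_data_to_dict assembles for one flow. The keys follow
# cols_for_exchange_dict above; every value here is a made-up placeholder
# (flow, amount, unit, comment), and the real "flow" and "uncertainty"
# entries are built by flow_table_creation and uncertainty_table_creation.
def _example_exchange_dict():
    return {
        "internalId": "",
        "@type": "Exchange",
        "avoidedProduct": False,
        "flow": {"name": "Carbon dioxide", "flowType": "ELEMENTARY_FLOW"},
        "flowProperty": "",
        "input": False,
        "quantitativeReference": False,
        "baseUncertainty": "",
        "provider": "",
        "amount": 0.75,  # Emission_factor for this flow (hypothetical)
        "amountFormula": "",
        "unit": "kg",
        "pedigreeUncertainty": "",
        # reliability; temporal; geographical; technological; data collection
        "dqEntry": "(1.0;2.0;1.0;3.0;2.0)",
        "uncertainty": "",
        "comment": "eGRID - 2016",  # source_string - year (hypothetical)
    }
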
def aggregate_data(total_db, subregion="BA"):
    """
    Aggregates facility-level emissions to the specified subregion and
    calculates emission factors based on the total emission and total
    electricity generation.

    Parameters
    ----------
    total_db : dataframe
        Facility-level emissions as generated by
        create_generation_process_df.
    subregion : str, optional
        The level of subregion that the data will be aggregated to. Choices
        are 'all', 'NERC', 'BA', 'US', by default 'BA'.

    Returns
    -------
    dataframe
        The dataframe provides the emissions aggregated to the specified
        subregion for each technology and stage in the input total_db. This
        dataframe includes an average emission factor and, when applicable,
        uncertainty distributions.
    """
    from electricitylci.aggregation_selector import subregion_col

    def geometric_mean(p_series, df, cols):
        # Alternatively we can use scipy.stats.lognorm to fit a distribution
        # and provide the parameters
        if (len(p_series) > 3) & (p_series.quantile(0.5) > 0):
            # result = gmean(p_series.to_numpy()+1)-1
            module_logger.debug(
                f"Calculating confidence interval for "
                f"{df.loc[p_series.index[0], groupby_cols].values}")
            module_logger.debug(f"{p_series.values}")
            with np.errstate(all='raise'):
                try:
                    data = p_series.to_numpy()
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with input data")
                    return None
                try:
                    log_data = np.log(data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with log function")
                    return None
                try:
                    mean = np.mean(log_data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with mean function")
                    return None
                l = len(data)
                try:
                    sd = np.std(log_data) / np.sqrt(l)
                    sd2 = sd**2
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with std function")
                    return None
                try:
                    pi1, pi2 = t.interval(
                        alpha=0.90, df=l - 2, loc=mean, scale=sd)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with t function")
                    return None
                try:
                    upper_interval = np.max([
                        mean
                        + sd2 / 2
                        + pi2 * np.sqrt(sd2 / l + sd2**2 / (2 * (l - 1))),
                        mean
                        + sd2 / 2
                        - pi2 * np.sqrt(sd2 / l + sd2**2 / (2 * (l - 1))),
                    ])
                except:
                    module_logger.debug("Problem with interval function")
                    return None
                try:
                    result = (np.exp(mean), 0, np.exp(upper_interval))
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Unable to calculate geometric_mean")
                    return None
            if result is not None:
                return result
            else:
                module_logger.debug(
                    f"Problem generating uncertainty parameters \n"
                    f"{df.loc[p_series.index[0], groupby_cols].values}\n"
                    f"{p_series.values}"
                    f"{p_series.values + 1}")
                return None
        else:
            return None

    def calc_geom_std(df):
        if region_agg is not None:
            debug_string = (
                f"{df[region_agg]}-{df['FuelCategory']}-{df['FlowName']}")
        else:
            debug_string = f"{df['FuelCategory']}-{df['FlowName']}"
        module_logger.debug(debug_string)
        if df["uncertaintyLognormParams"] is None:
            return None, None
        if isinstance(df["uncertaintyLognormParams"], str):
            params = ast.literal_eval(df["uncertaintyLognormParams"])
        try:
            length = len(df["uncertaintyLognormParams"])
        except TypeError:
            module_logger.info(
                f"Error calculating length of uncertaintyLognormParams "
                f"{df['uncertaintyLognormParams']}")
            return None, None
        if length != 3:
            module_logger.info(
                f"Error estimating standard deviation - length: {length}")
            return None, None
        # In some cases, the final emission factor is far different from the
        # geometric mean of the individual emission factors. Depending on the
        # severity, this could be a clear sign of outliers having a large
        # impact on the final emission factor. When the uncertainty is
        # generated for these cases, the results can be nonsensical - hence
        # we skip them. A more aggressive approach would be to re-assign the
        # emission factor as well.
        if df["Emission_factor"] > df["uncertaintyLognormParams"][2]:
            return None, None
        c = np.log(df["uncertaintyLognormParams"][2]) - np.log(
            df["Emission_factor"])
        b = -2**0.5 * erfinv(2 * 0.95 - 1)
        a = 0.5
        sd1 = (-b + (b**2 - 4 * a * c)**0.5) / (2 * a)
        sd2 = (-b - (b**2 - 4 * a * c)**0.5) / (2 * a)
        if not np.isnan(sd1) and not np.isnan(sd2):
            if sd1 < sd2:
                geostd = np.exp(sd1)
                geomean = np.exp(
                    np.log(df["Emission_factor"]) - 0.5 * sd1**2)
            else:
                geostd = np.exp(sd2)
                geomean = np.exp(
                    np.log(df["Emission_factor"]) - 0.5 * sd2**2)
        elif not np.isnan(sd1):
            geostd = np.exp(sd1)
            geomean = np.exp(np.log(df["Emission_factor"]) - 0.5 * sd1**2)
        elif not np.isnan(sd2):
            geostd = np.exp(sd2)
            geomean = np.exp(np.log(df["Emission_factor"]) - 0.5 * sd2**2)
        else:
            return None, None
        if (
            (geostd is np.inf)
            or (geostd is np.NINF)
            or (geostd is np.nan)
            or (geostd is float("nan"))
            or str(geostd) == "nan"
            or (geostd == 0)
        ):
            return None, None
        return str(geomean), str(geostd)

    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        groupby_cols = (
            region_agg
            + fuel_agg
            + ["stage_code", "FlowName", "Compartment", "FlowUUID", "Unit"]
        )
        elec_df_groupby_cols = (
            region_agg + fuel_agg + ["Year", "source_string"])
    else:
        groupby_cols = fuel_agg + [
            "stage_code", "FlowName", "Compartment", "FlowUUID", "Unit"
        ]
        elec_df_groupby_cols = fuel_agg + ["Year", "source_string"]
    if model_specs.replace_egrid:
        primary_fuel_df = eia923_primary_fuel(year=model_specs.eia_gen_year)
        primary_fuel_df.rename(columns={'Plant Id': "eGRID_ID"}, inplace=True)
        primary_fuel_df["eGRID_ID"] = primary_fuel_df["eGRID_ID"].astype(int)
        key_df = (
            primary_fuel_df[["eGRID_ID", "FuelCategory"]]
            .dropna()
            .drop_duplicates(subset="eGRID_ID")
            .set_index("eGRID_ID")
        )
        total_db.loc[
            total_db["FuelCategory"] != "ALL", "FuelCategory"
        ] = total_db["eGRID_ID"].map(key_df["FuelCategory"])
    total_db["FlowUUID"] = total_db["FlowUUID"].fillna(value="dummy-uuid")
    total_db = aggregate_facility_flows(total_db)
    total_db, electricity_df = calculate_electricity_by_source(
        total_db, subregion)
    total_db["FlowAmount"].replace(to_replace=0, value=1E-15, inplace=True)
    total_db = add_data_collection_score(total_db, electricity_df, subregion)
    total_db["facility_emission_factor"] = (
        total_db["FlowAmount"] / total_db["Electricity"])
    total_db.dropna(subset=["facility_emission_factor"], inplace=True)

    def wtd_mean(pdser, total_db, cols):
        try:
            wts = total_db.loc[pdser.index, "FlowAmount"]
            result = np.average(pdser, weights=wts)
        except:
            module_logger.debug(
                f"Error calculating weighted mean for {pdser.name}-"
                f"likely from 0 FlowAmounts"
                # f"{total_db.loc[pdser.index[0], cols]}"
            )
            try:
                with np.errstate(all='raise'):
                    result = np.average(pdser)
            except (ArithmeticError, ValueError, FloatingPointError):
                result = float("nan")
        return result

    wm = lambda x: wtd_mean(x, total_db, groupby_cols)
    geo_mean = lambda x: geometric_mean(x, total_db, groupby_cols)
    geo_mean.__name__ = "geo_mean"
    print(
        "Aggregating flow amounts, dqi information, and calculating "
        "uncertainty"
    )
    database_f3 = total_db.groupby(
        groupby_cols + ["Year", "source_string"], as_index=False
    ).agg({
        "FlowAmount": ["sum", "count"],
        "TemporalCorrelation": wm,
        "TechnologicalCorrelation": wm,
        "GeographicalCorrelation": wm,
        "DataCollection": wm,
        "ReliabilityScore": wm,
        "facility_emission_factor": ["min", "max", geo_mean],
    })
    database_f3.columns = groupby_cols + [
        "Year",
        "source_string",
        "FlowAmount",
        "FlowAmountCount",
        "TemporalCorrelation",
        "TechnologicalCorrelation",
        "GeographicalCorrelation",
        "DataCollection",
        "ReliabilityScore",
        "uncertaintyMin",
        "uncertaintyMax",
        "uncertaintyLognormParams",
    ]
    criteria = database_f3["Compartment"] == "input"
    database_f3.loc[criteria, "uncertaintyLognormParams"] = None
    database_f3 = database_f3.merge(
        right=electricity_df,
        left_on=elec_df_groupby_cols,
        right_on=elec_df_groupby_cols,
        how="left",
    )
    canadian_criteria = database_f3["FuelCategory"] == "ALL"
    if region_agg:
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_db[groupby_cols + ["Electricity"]],
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        ).drop_duplicates(subset=groupby_cols)
    else:
        total_grouped = total_db.groupby(
            by=groupby_cols, as_index=False)["Electricity"].sum()
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_grouped,
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        )
    canada_db.index = database_f3.loc[canadian_criteria, :].index
    database_f3.loc[
        database_f3["FlowUUID"] == "dummy-uuid", "FlowUUID"
    ] = float("nan")
    database_f3.loc[
        canada_db.index, "electricity_sum"] = canada_db["Electricity"]
    database_f3["Emission_factor"] = (
        database_f3["FlowAmount"] / database_f3["electricity_sum"])
    # Infinite values generally come from places with 0 generation. This
    # happens particularly with the Canadian mixes.
    database_f3["Emission_factor"].replace(
        to_replace=float("inf"), value=0, inplace=True)
    database_f3["Emission_factor"].replace(
        to_replace=float("-inf"), value=0, inplace=True)
    if region_agg is not None:
        database_f3["GeomMean"], database_f3["GeomSD"] = zip(*database_f3[[
            "Emission_factor",
            "uncertaintyLognormParams",
            "uncertaintyMin",
            "uncertaintyMax",
            "FuelCategory",
            "FlowName",
        ] + region_agg].apply(calc_geom_std, axis=1))
    else:
        database_f3["GeomMean"], database_f3["GeomSD"] = zip(*database_f3[[
            "Emission_factor",
            "uncertaintyLognormParams",
            "uncertaintyMin",
            "uncertaintyMax",
            "FuelCategory",
            "FlowName",
        ]].apply(calc_geom_std, axis=1))
    database_f3.sort_values(by=groupby_cols, inplace=True)
    return database_f3
def calculate_electricity_by_source(db, subregion="BA"):
    """
    Calculates the electricity totals by region and source using the same
    approach as the original generation.py, with attempts made to speed it
    up. That is, each flow will have a source associated with it (eGRID,
    NEI, TRI, RCRAInfo). To develop an emission factor, the FlowAmount will
    need to be divided by electricity generation. This routine sums all
    electricity generation for all source/subregion combinations. So if a
    subregion aggregates FlowAmounts sourced from NEI and TRI, then the
    denominator will be all production from plants that reported into NEI
    or TRI for that subregion.

    Parameters
    ----------
    db : dataframe
        Dataframe containing facility-level emissions as generated by
        create_generation_process_df.
    subregion : str, optional
        The level of subregion that the data will be aggregated to. Choices
        are 'all', 'NERC', 'BA', 'US', by default 'BA'.

    Returns
    -------
    tuple of dataframes
        The first dataframe is the input emissions data with source_string
        and source_list columns added; the second contains the electricity
        totals for each subregion/source combination.
    """
    from electricitylci.aggregation_selector import subregion_col

    all_sources = '_'.join(sorted(list(db["Source"].unique())))
    power_plant_criteria = db["stage_code"] == "Power plant"
    db_powerplant = db.loc[power_plant_criteria, :]
    db_nonpower = db.loc[~power_plant_criteria, :]
    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        groupby_cols = (
            region_agg
            + fuel_agg
            + ["Year", "stage_code", "FlowName", "Compartment"]
        )
        elec_groupby_cols = region_agg + fuel_agg + ["Year"]
    else:
        groupby_cols = fuel_agg + [
            "Year",
            "stage_code",
            "FlowName",
            "Compartment",
        ]
        elec_groupby_cols = fuel_agg + ["Year"]
    combine_source_by_flow = lambda x: _combine_sources(
        x, db, ["FlowName", "Compartment"], 1)
    combine_source_lambda = lambda x: _combine_sources(
        x, db_multiple_sources, groupby_cols)
    # power_db = db.loc[db["stage_code"]=='Power plant',:]

    # This is a pretty expensive process when we have to start looking at
    # each flow generated in each compartment for each balancing authority
    # area. To hopefully speed this up, we'll group by FlowName and
    # Compartment and try to eliminate flows where all sources are single
    # entities.
    source_df = pd.DataFrame(
        db_powerplant.groupby(
            ["FlowName", "Compartment"])[["Source"]].apply(
                combine_source_by_flow),
        columns=["source_list"],
    )
    source_df[["source_list", "source_string"]] = pd.DataFrame(
        source_df["source_list"].values.tolist(), index=source_df.index)
    source_df.reset_index(inplace=True)
    old_index = db_powerplant.index
    db_powerplant = db_powerplant.merge(
        right=source_df,
        left_on=["FlowName", "Compartment"],
        right_on=["FlowName", "Compartment"],
        how="left",
    )
    db_powerplant.index = old_index
    db_multiple_sources = db_powerplant.loc[
        db_powerplant["source_string"].isna(), :]
    if len(db_multiple_sources) > 0:
        source_df = pd.DataFrame(
            db_multiple_sources.groupby(groupby_cols)[["Source"]].apply(
                combine_source_lambda),
            columns=["source_list"],
        )
        source_df[["source_list", "source_string"]] = pd.DataFrame(
            source_df["source_list"].values.tolist(), index=source_df.index)
        source_df.reset_index(inplace=True)
        db_multiple_sources.drop(
            columns=["source_list", "source_string"], inplace=True)
        old_index = db_multiple_sources.index
        db_multiple_sources = db_multiple_sources.merge(
            right=source_df,
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        )
        db_multiple_sources.index = old_index
        # db[["source_string","source_list"]].fillna(
        #     db_multiple_sources[["source_string","source_list"]],
        #     inplace=True)
        db_powerplant.loc[
            db_powerplant["source_string"].isna(),
            ["source_string", "source_list"],
        ] = db_multiple_sources[["source_string", "source_list"]]
    unique_source_lists = list(db_powerplant["source_string"].unique())
    # unique_source_lists = [
    #     x for x in unique_source_lists
    #     if ((str(x) != "nan") & (str(x) != "netl"))]
    unique_source_lists = [
        x for x in unique_source_lists if ((str(x) != "nan"))
    ]
    # One set of emissions passed into this routine may be life cycle
    # emissions used as proxies for Canadian generation. In those cases the
    # electricity generation will be equal to the Electricity already in the
    # dataframe.
    elec_sum_lists = list()
    unique_source_lists = unique_source_lists + [all_sources]
    for src in unique_source_lists:
        module_logger.info(f"Calculating electricity for {src}")
        # src_filter = db.apply(lambda x: x["Source"] in src, axis=1)
        db["temp_src"] = src
        src_filter = [
            a in b
            for a, b in zip(
                db["Source"].values.tolist(), db["temp_src"].values.tolist())
        ]
        # total_filter = ~fuelcat_all & src_filter
        sub_db = db.loc[src_filter, :]
        sub_db.drop_duplicates(subset=fuel_agg + ["eGRID_ID"], inplace=True)
        sub_db_group = sub_db.groupby(elec_groupby_cols, as_index=False).agg(
            {"Electricity": [np.sum, np.mean], "eGRID_ID": "count"})
        sub_db_group.columns = elec_groupby_cols + [
            "electricity_sum",
            "electricity_mean",
            "facility_count",
        ]
        # zero_elec_filter = sub_db_group["electricity_sum"]==0
        sub_db_group["source_string"] = src
        elec_sum_lists.append(sub_db_group)
    db_nonpower["source_string"] = all_sources
    db_nonpower["source_list"] = [all_sources] * len(db_nonpower)
    elec_sums = pd.concat(elec_sum_lists, ignore_index=True)
    elec_sums.sort_values(by=elec_groupby_cols, inplace=True)
    db = pd.concat([db_powerplant, db_nonpower])
    return db, elec_sums
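
# Hedged illustration of the substring membership test used above to build
# the emission factor denominators: a plant's generation counts toward a
# combined source string whenever its own Source appears in that string.
# The toy dataframe is hypothetical.
def _example_source_membership():
    toy = pd.DataFrame({
        "eGRID_ID": [1, 2, 3],
        "Source": ["eGRID", "NEI", "TRI"],
        "Electricity": [100.0, 50.0, 25.0],
    })
    src = "NEI_TRI"  # combined source string for a flow reported to NEI and TRI
    mask = [plant_source in src for plant_source in toy["Source"]]
    # Only plants 2 and 3 report to NEI or TRI, so the denominator is 75.0
    return toy.loc[mask, "Electricity"].sum()
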
def generate_regional_grid_loss(final_database, year, subregion="all"):
    """This function generates transmission and distribution losses for the
    provided generation data and given year, aggregated by subregion.

    Arguments:
        final_database: dataframe
            The database containing plant-level emissions.
        year: int
            Analysis year for the transmission and distribution loss data.
            Ideally this should match the year of your final_database.
        subregion: str, optional
            The level of subregion to aggregate the losses to, by default
            "all".

    Returns:
        td_by_region: dataframe
            A dataframe of transmission and distribution loss rates as a
            fraction. This dataframe can be used to generate unit processes
            for transmission and distribution to match the regionally-
            aggregated emissions unit processes.
    """
    print("Generating factors for transmission and distribution losses")
    from electricitylci.eia923_generation import build_generation_data
    from electricitylci.combinator import ba_codes
    from electricitylci.egrid_facilities import egrid_facilities

    td_calc_columns = [
        "State",
        "NERC",
        "FuelCategory",
        "PrimaryFuel",
        "NERC",
        "Balancing Authority Name",
        "Electricity",
        "Year",
        "Subregion",
        "FRS_ID",
        "eGRID_ID",
    ]
    # plant_generation = final_database[td_calc_columns].drop_duplicates()
    egrid_facilities_w_fuel_region = egrid_facilities[[
        "FacilityID",
        "Subregion",
        "PrimaryFuel",
        "FuelCategory",
        "NERC",
        "PercentGenerationfromDesignatedFuelCategory",
        "Balancing Authority Name",
        "Balancing Authority Code",
        "State",
    ]]
    egrid_facilities_w_fuel_region["FacilityID"] = (
        egrid_facilities_w_fuel_region["FacilityID"].astype(int))
    plant_generation = build_generation_data(generation_years=[year])
    plant_generation["FacilityID"] = (
        plant_generation["FacilityID"].astype(int))
    plant_generation = plant_generation.merge(
        egrid_facilities_w_fuel_region, on=["FacilityID"], how="left")
    plant_generation["Balancing Authority Name"] = plant_generation[
        "Balancing Authority Code"].map(ba_codes["BA_Name"])
    plant_generation["FERC_Region"] = plant_generation[
        "Balancing Authority Code"].map(ba_codes["FERC_Region"])
    plant_generation["EIA_Region"] = plant_generation[
        "Balancing Authority Code"].map(ba_codes["EIA_Region"])
    td_rates = eia_trans_dist_download_extract(f"{year}")
    td_by_plant = pd.merge(
        left=plant_generation,
        right=td_rates,
        left_on="State",
        right_index=True,
        how="left",
    )
    td_by_plant.dropna(subset=["t_d_losses"], inplace=True)
    td_by_plant["t_d_losses"] = td_by_plant["t_d_losses"].astype(float)

    from electricitylci.aggregation_selector import subregion_col
    aggregation_column = subregion_col(subregion)
    wm = lambda x: np.average(
        x, weights=td_by_plant.loc[x.index, "Electricity"])
    if aggregation_column is not None:
        td_by_region = td_by_plant.groupby(
            aggregation_column, as_index=False).agg({"t_d_losses": wm})
    else:
        td_by_region = pd.DataFrame(
            td_by_plant.agg({"t_d_losses": wm}), columns=["t_d_losses"])
        td_by_region["Region"] = "US"
    return td_by_region
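
# Hedged illustration of the generation-weighted aggregation used above:
# within each region, t_d_losses are averaged using plant Electricity as the
# weights, so large plants dominate the regional loss rate. Values are
# hypothetical.
def _example_weighted_td_losses():
    plants = pd.DataFrame({
        "Region": ["A", "A", "B"],
        "Electricity": [900.0, 100.0, 500.0],
        "t_d_losses": [0.05, 0.10, 0.04],
    })
    # Region A -> (0.05*900 + 0.10*100)/1000 = 0.055, Region B -> 0.04
    return plants.groupby("Region").apply(
        lambda g: np.average(g["t_d_losses"], weights=g["Electricity"]))
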
def aggregate_data(total_db, subregion="BA"):
    """
    Aggregates facility-level emissions to the specified subregion and
    calculates emission factors based on the total emission and total
    electricity generation.

    Parameters
    ----------
    total_db : dataframe
        Facility-level emissions as generated by
        create_generation_process_df.
    subregion : str, optional
        The level of subregion that the data will be aggregated to. Choices
        are 'all', 'NERC', 'BA', 'US', by default 'BA'.
    """
    from electricitylci.aggregation_selector import subregion_col

    def geometric_mean(p_series, df, cols):
        # I think I actually need to replace this with the function contained
        # in process_exchange_aggregator_uncertainty.py. The approach to add
        # 1 will also lead to some large errors when dealing with small
        # numbers. Alternatively we can use scipy.stats.lognorm to fit a
        # distribution and provide the parameters
        if (len(p_series) > 3) & (p_series.quantile(0.5) > 0):
            # result = gmean(p_series.to_numpy()+1)-1
            module_logger.debug(
                f"Calculating confidence interval for "
                f"{df.loc[p_series.index[0], groupby_cols].values}")
            module_logger.debug(f"{p_series.values}")
            with np.errstate(all='raise'):
                try:
                    data = p_series.to_numpy()
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with input data")
                    return None
                try:
                    log_data = np.log(data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with log function")
                    return None
                try:
                    mean = np.mean(log_data)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with mean function")
                    return None
                l = len(data)
                try:
                    sd = np.std(log_data)
                    sd2 = sd**2
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with std function")
                    return None
                try:
                    pi1, pi2 = t.interval(
                        alpha=0.90, df=l - 2, loc=mean, scale=sd)
                except (ArithmeticError, ValueError, FloatingPointError):
                    module_logger.debug("Problem with t function")
                    return None
                try:
                    upper_interval = np.max([
                        mean
                        + sd2 / 2
                        + pi2 * np.sqrt(sd2 / l + sd2**2 / (2 * (l - 1))),
                        mean
                        + sd2 / 2
                        - pi2 * np.sqrt(sd2 / l + sd2**2 / (2 * (l - 1))),
                    ])
                except:
                    module_logger.debug("Problem with interval function")
                    return None
                try:
                    result = (np.exp(mean), 0, np.exp(upper_interval))
                except (ArithmeticError, ValueError, FloatingPointError):
                    print("Problem with result")
                    return None
            if result is not None:
                return result
            else:
                module_logger.debug(
                    f"Problem generating uncertainty parameters \n"
                    f"{df.loc[p_series.index[0], groupby_cols].values}\n"
                    f"{p_series.values}"
                    f"{p_series.values + 1}")
                return None
        else:
            return None

    def calc_geom_std(df):
        if df["uncertaintyLognormParams"] is None:
            return None, None
        if isinstance(df["uncertaintyLognormParams"], str):
            params = ast.literal_eval(df["uncertaintyLognormParams"])
        try:
            length = len(df["uncertaintyLognormParams"])
        except TypeError:
            module_logger.info(
                f"Error calculating length of uncertaintyLognormParams "
                f"{df['uncertaintyLognormParams']}")
            return None, None
        if length != 3:
            module_logger.info(
                f"Error estimating standard deviation - length: {length}")
        try:
            geomean = df["Emission_factor"]
            geostd = np.exp(
                (np.log(df["uncertaintyLognormParams"][2])
                 - np.log(df["Emission_factor"])) / norm.ppf(0.95))
        except ArithmeticError:
            module_logger.info("Error estimating standard deviation")
            return None, None
        if (
            (geostd is np.inf)
            or (geostd is np.NINF)
            or (geostd is np.nan)
            or (geostd is float("nan"))
            or str(geostd) == "nan"
        ):
            return None, None
        if geostd * geomean > df["uncertaintyMax"]:
            return None, None
        return str(geomean), str(geostd)

    region_agg = subregion_col(subregion)
    fuel_agg = ["FuelCategory"]
    if region_agg:
        groupby_cols = (
            region_agg
            + fuel_agg
            + ["stage_code", "FlowName", "Compartment", "FlowUUID"]
        )
        elec_df_groupby_cols = (
            region_agg + fuel_agg + ["Year", "source_string"])
    else:
        groupby_cols = fuel_agg + [
            "stage_code",
            "FlowName",
            "Compartment",
            "FlowUUID",
        ]
        elec_df_groupby_cols = fuel_agg + ["Year", "source_string"]
    total_db["FlowUUID"] = total_db["FlowUUID"].fillna(value="dummy-uuid")
    total_db = aggregate_facility_flows(total_db)
    total_db, electricity_df = calculate_electricity_by_source(
        total_db, subregion)
    total_db = add_data_collection_score(total_db, electricity_df, subregion)
    total_db["facility_emission_factor"] = (
        total_db["FlowAmount"] / total_db["Electricity"])
    total_db.dropna(subset=["facility_emission_factor"], inplace=True)

    def wtd_mean(pdser, total_db, cols):
        try:
            wts = total_db.loc[pdser.index, "Electricity"]
            result = np.average(pdser, weights=wts)
        except:
            module_logger.info(
                f"Error calculating weighted mean for {pdser.name}-"
                f"{total_db.loc[pdser.index[0], cols]}")
            result = float("nan")
        return result

    wm = lambda x: wtd_mean(x, total_db, groupby_cols)
    geo_mean = lambda x: geometric_mean(x, total_db, groupby_cols)
    geo_mean.__name__ = "geo_mean"
    print(
        "Aggregating flow amounts, dqi information, and calculating "
        "uncertainty"
    )
    database_f3 = total_db.groupby(
        groupby_cols + ["Year", "source_string"], as_index=False
    ).agg({
        "FlowAmount": ["sum", "count"],
        "TemporalCorrelation": wm,
        "TechnologicalCorrelation": wm,
        "GeographicalCorrelation": wm,
        "DataCollection": wm,
        "ReliabilityScore": wm,
        "facility_emission_factor": ["min", "max", geo_mean],
    })
    database_f3.columns = groupby_cols + [
        "Year",
        "source_string",
        "FlowAmount",
        "FlowAmountCount",
        "TemporalCorrelation",
        "TechnologicalCorrelation",
        "GeographicalCorrelation",
        "DataCollection",
        "ReliabilityScore",
        "uncertaintyMin",
        "uncertaintyMax",
        "uncertaintyLognormParams",
    ]
    criteria = database_f3["Compartment"] == "input"
    database_f3.loc[criteria, "uncertaintyLognormParams"] = None
    database_f3 = database_f3.merge(
        right=electricity_df,
        left_on=elec_df_groupby_cols,
        right_on=elec_df_groupby_cols,
        how="left",
    )
    canadian_criteria = database_f3["FuelCategory"] == "ALL"
    if region_agg:
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_db[groupby_cols + ["Electricity"]],
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        ).drop_duplicates(subset=groupby_cols)
    else:
        total_grouped = total_db.groupby(
            by=groupby_cols, as_index=False)["Electricity"].sum()
        canada_db = pd.merge(
            left=database_f3.loc[canadian_criteria, :],
            right=total_grouped,
            left_on=groupby_cols,
            right_on=groupby_cols,
            how="left",
        )
    canada_db.index = database_f3.loc[canadian_criteria, :].index
    database_f3.loc[
        database_f3["FlowUUID"] == "dummy-uuid", "FlowUUID"
    ] = float("nan")
    database_f3.loc[
        canada_db.index, "electricity_sum"] = canada_db["Electricity"]
    database_f3["Emission_factor"] = (
        database_f3["FlowAmount"] / database_f3["electricity_sum"])
    database_f3["GeomMean"], database_f3["GeomSD"] = zip(*database_f3[[
        "Emission_factor",
        "uncertaintyLognormParams",
        "uncertaintyMin",
        "uncertaintyMax",
    ]].apply(calc_geom_std, axis=1))
    database_f3.sort_values(by=groupby_cols, inplace=True)
    return database_f3
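
# Hedged illustration of the weighted mean used for the DQI columns in both
# aggregate_data variants (TemporalCorrelation, DataCollection, etc.): the
# weights are FlowAmount in the first variant and Electricity in the second,
# so facilities with larger flows or generation dominate the aggregated
# score. The numbers are hypothetical.
def _example_weighted_dqi():
    facilities = pd.DataFrame({
        "FlowAmount": [1000.0, 10.0],
        "TemporalCorrelation": [1.0, 5.0],
    })
    # (1000*1 + 10*5) / 1010 ~= 1.04; the large emitter dominates
    return np.average(
        facilities["TemporalCorrelation"], weights=facilities["FlowAmount"])
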