def aggregator(df, groupbycols):
    """
    Aggregate the 'FlowAmount' column of a flowbyactivity or flowbysector
    df, computing FlowAmount-weighted averages for any data-quality
    columns that are present.
    :param df: df, Either flowbyactivity or flowbysector
    :param groupbycols: list, Either flowbyactivity or flowbysector columns
    :return: df, with aggregated columns
    """
    # start from a clean integer index
    df = df.reset_index(drop=True)
    # temporarily swap null values for empty cells so groupby keeps the rows
    df = replace_NoneType_with_empty_cells(df)
    # rows with a zero flow contribute nothing to the aggregation
    df = df[df['FlowAmount'] != 0]
    # data-quality fields that, when present in the df, are combined with a
    # FlowAmount-weighted average rather than a sum
    weighted_avg_candidates = \
        ('Spread', 'Min', 'Max', 'DataReliability', 'TemporalCorrelation',
         'GeographicalCorrelation', 'TechnologicalCorrelation',
         'DataCollection')
    # subset of those candidates that actually exist in this df
    present_cols = [c for c in weighted_avg_candidates
                    if c in df.columns.values.tolist()]
    grouped = df.groupby(groupbycols).agg({'FlowAmount': ['sum']})
    # attach a weighted average for each data-quality column found
    for col in present_cols:
        grouped[col] = get_weighted_average(df, col, 'FlowAmount',
                                            groupbycols)
    grouped = grouped.reset_index()
    grouped.columns = grouped.columns.droplevel(level=1)
    # restore NoneType for string columns that were blanked earlier
    grouped = replace_strings_with_NoneType(grouped)
    return grouped
def aggregate(df, grouping_vars=None):
    """Aggregate a 'FlowAmount' in a dataframe based on the passed
    grouping_vars and generating a weighted average for data quality fields.

    :param df: dataframe to aggregate
    :param grouping_vars: list of df column headers on which to groupby
    :return: aggregated dataframe with weighted average data reliability score
    """
    # default: group on every column except the two being aggregated
    if grouping_vars is None:
        grouping_vars = [c for c in df.columns
                         if c not in ['FlowAmount', 'DataReliability']]
    aggregated = df.groupby(grouping_vars).agg({'FlowAmount': ['sum']})
    # reliability score is averaged, weighted by each row's flow amount
    aggregated['DataReliability'] = get_weighted_average(
        df, 'DataReliability', 'FlowAmount', grouping_vars)
    aggregated = aggregated.reset_index()
    aggregated.columns = aggregated.columns.droplevel(level=1)
    # drop those rows where flow amount is negative, zero, or NaN
    aggregated = aggregated[aggregated['FlowAmount'] > 0]
    aggregated = aggregated[aggregated['FlowAmount'].notna()]
    return aggregated
def prepare_stewi_fbs(df, inventory_dict, NAICS_level, geo_scale):
    """
    Function to prepare an emissions df from stewi or stewicombo for use as FBS
    :param df: a dataframe of emissions and mapped faciliites from stewi
        or stewicombo
    :param inventory_dict: a dictionary of inventory types and years (e.g.,
        {'NEI':'2017', 'TRI':'2017'})
    :param NAICS_level: desired NAICS aggregation level, using
        sector_level_key, should match target_sector_level
    :param geo_scale: desired geographic aggregation level ('national',
        'state', 'county'), should match target_geoscale
    :return: df, emissions in flowbysector format
    """
    # update location to appropriate geoscale prior to aggregating
    df.dropna(subset=['Location'], inplace=True)
    df['Location'] = df['Location'].astype(str)
    df = update_geoscale(df, geo_scale)

    # assign grouping variables based on desired geographic aggregation level
    grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']
    if 'MetaSources' in df:
        grouping_vars.append('MetaSources')

    # aggregate by NAICS code, FlowName, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({'FlowAmount': 'sum',
                                         'Year': 'first',
                                         'Unit': 'first'})
    # add reliability score, weighted by each row's flow amount
    fbs['DataReliability'] = get_weighted_average(
        df, 'DataReliability', 'FlowAmount', grouping_vars)
    fbs.reset_index(inplace=True)

    # apply flow mapping separately for elementary and waste flows
    fbs['FlowType'] = 'ELEMENTARY_FLOW'
    # BUG FIX: 'MetaSources' only exists when the source df carried it
    # (see grouping_vars above); guard to avoid a KeyError when absent
    if 'MetaSources' in fbs:
        fbs.loc[fbs['MetaSources'] == 'RCRAInfo', 'FlowType'] = 'WASTE_FLOW'
        # Add 'SourceName' for mapping purposes
        fbs['SourceName'] = fbs['MetaSources']

    fbs_elem = fbs.loc[fbs['FlowType'] == 'ELEMENTARY_FLOW']
    fbs_waste = fbs.loc[fbs['FlowType'] == 'WASTE_FLOW']
    fbs_list = []
    if len(fbs_elem) > 0:
        fbs_elem = map_flows(fbs_elem, list(inventory_dict.keys()),
                             flow_type='ELEMENTARY_FLOW')
        fbs_list.append(fbs_elem)
    if len(fbs_waste) > 0:
        fbs_waste = map_flows(fbs_waste, list(inventory_dict.keys()),
                              flow_type='WASTE_FLOW')
        fbs_list.append(fbs_waste)

    if len(fbs_list) == 1:
        fbs_mapped = fbs_list[0]
    else:
        # BUG FIX: pd.concat is a function call; the original used
        # pd.concat[fbs_list] (subscript), which raises TypeError at runtime
        fbs_mapped = pd.concat(fbs_list).reset_index(drop=True)

    # rename columns to match flowbysector format
    fbs_mapped = fbs_mapped.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # add hardcoded data, depending on the source data,
    # some of these fields may need to change
    fbs_mapped['Class'] = 'Chemicals'
    fbs_mapped['SectorConsumedBy'] = 'None'
    fbs_mapped['SectorSourceName'] = 'NAICS_2012_Code'
    fbs_mapped = assign_fips_location_system(
        fbs_mapped, list(inventory_dict.values())[0])

    # add missing flow by sector fields
    fbs_mapped = add_missing_flow_by_fields(fbs_mapped, flow_by_sector_fields)

    fbs_mapped = check_for_missing_sector_data(fbs_mapped, NAICS_level)

    # sort dataframe and reset index
    fbs_mapped = fbs_mapped.sort_values(
        list(flow_by_sector_fields.keys())).reset_index(drop=True)

    # check the sector codes to make sure NAICS 2012 codes
    fbs_mapped = replace_naics_w_naics_from_another_year(
        fbs_mapped, 'NAICS_2012_Code')

    return fbs_mapped