Code Example #1
def aggregator(df, groupbycols):
    """
    Aggregates the flowbyactivity or flowbysector 'FlowAmount' column in df
    and generates weighted average values, based on FlowAmount, for the other
    numeric columns
    :param df: df, either flowbyactivity or flowbysector
    :param groupbycols: list, either flowbyactivity or flowbysector columns
    :return: df, with aggregated columns
    """

    # reset index
    df = df.reset_index(drop=True)
    # temporarily replace null values with empty cells (groupby drops NaN keys)
    df = replace_NoneType_with_empty_cells(df)

    # drop rows where FlowAmount is 0
    df = df[df['FlowAmount'] != 0]

    # column headers that, if present in df, should be
    # aggregated using the weighted average function
    possible_column_headers = \
        ('Spread', 'Min', 'Max', 'DataReliability', 'TemporalCorrelation',
         'GeographicalCorrelation', 'TechnologicalCorrelation',
         'DataCollection')

    # list of column headers that do exist in the df being aggregated
    column_headers = [
        e for e in possible_column_headers if e in df.columns.values.tolist()
    ]

    df_dfg = df.groupby(groupbycols).agg({'FlowAmount': ['sum']})

    # run through other columns creating weighted average
    for e in column_headers:
        df_dfg[e] = get_weighted_average(df, e, 'FlowAmount', groupbycols)

    df_dfg = df_dfg.reset_index()
    df_dfg.columns = df_dfg.columns.droplevel(level=1)

    # for string columns, restore NoneType for the null values replaced earlier
    df_dfg = replace_strings_with_NoneType(df_dfg)

    return df_dfg
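
All three examples lean on a get_weighted_average helper that is not shown here. Below is a minimal sketch of what it plausibly computes, assuming a straightforward FlowAmount-weighted group mean returned as a Series indexed by the group keys; the real flowsa implementation may handle nulls and index alignment differently.

def get_weighted_average(df, data_col, weight_col, groupby_cols):
    """Sketch: weight_col-weighted mean of data_col per group.

    Returns a Series indexed by groupby_cols, so assigning it to a
    grouped dataframe (as the examples here do) aligns on index.
    """
    # multiply values by weights, sum per group, divide by summed weights
    weighted = df.assign(_weighted=df[data_col] * df[weight_col])
    grouped = weighted.groupby(groupby_cols)
    return grouped['_weighted'].sum() / grouped[weight_col].sum()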
Code Example #2
def aggregate(df, grouping_vars=None):
    """Aggregate a 'FlowAmount' in a dataframe based on the passed grouping_vars
    and generating a weighted average for data quality fields.

    :param df: dataframe to aggregate
    :param grouping_vars: list of df column headers on which to groupby
    :return: aggregated dataframe with weighted average data reliability score
    """
    if grouping_vars is None:
        grouping_vars = [
            x for x in df.columns
            if x not in ['FlowAmount', 'DataReliability']
        ]
    df_agg = df.groupby(grouping_vars).agg({'FlowAmount': ['sum']})
    df_agg['DataReliability'] = get_weighted_average(df, 'DataReliability',
                                                     'FlowAmount',
                                                     grouping_vars)
    df_agg = df_agg.reset_index()
    df_agg.columns = df_agg.columns.droplevel(level=1)
    # drop rows where FlowAmount is negative, zero, or NaN
    # (NaN fails the > 0 comparison, so one filter covers all three)
    df_agg = df_agg[df_agg['FlowAmount'] > 0]
    return df_agg
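
A hypothetical call to aggregate() on a made-up dataframe, assuming get_weighted_average behaves as sketched above:

import pandas as pd

df = pd.DataFrame({
    'FlowName': ['CO2', 'CO2', 'CH4'],
    'Location': ['US', 'US', 'US'],
    'FlowAmount': [10.0, 30.0, 5.0],
    'DataReliability': [1.0, 3.0, 2.0],
})

agg = aggregate(df, grouping_vars=['FlowName', 'Location'])
# the two CO2 rows collapse to FlowAmount 40.0 with a weighted
# DataReliability of (10*1 + 30*3) / 40 = 2.5
print(agg)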
Code Example #3
import pandas as pd


def prepare_stewi_fbs(df, inventory_dict, NAICS_level, geo_scale):
    """
    Function to prepare an emissions df from stewi or stewicombo for use as FBS
    :param df: a dataframe of emissions and mapped facilities from stewi
                or stewicombo
    :param inventory_dict: a dictionary of inventory types and years (e.g.,
                {'NEI':'2017', 'TRI':'2017'})
    :param NAICS_level: desired NAICS aggregation level, using
        sector_level_key, should match target_sector_level
    :param geo_scale: desired geographic aggregation level
        ('national', 'state', 'county'), should match target_geoscale
    :return: df
    """
    # update location to appropriate geoscale prior to aggregating
    df.dropna(subset=['Location'], inplace=True)
    df['Location'] = df['Location'].astype(str)
    df = update_geoscale(df, geo_scale)

    # assign grouping variables based on desired geographic aggregation level
    grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']
    if 'MetaSources' in df:
        grouping_vars.append('MetaSources')

    # aggregate by NAICS code, FlowName, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'Year': 'first',
        'Unit': 'first'
    })

    # add reliability score
    fbs['DataReliability'] = get_weighted_average(df, 'DataReliability',
                                                  'FlowAmount', grouping_vars)
    fbs.reset_index(inplace=True)

    # apply flow mapping separately for elementary and waste flows
    fbs['FlowType'] = 'ELEMENTARY_FLOW'
    fbs.loc[fbs['MetaSources'] == 'RCRAInfo', 'FlowType'] = 'WASTE_FLOW'

    # Add 'SourceName' for mapping purposes
    fbs['SourceName'] = fbs['MetaSources']
    fbs_elem = fbs.loc[fbs['FlowType'] == 'ELEMENTARY_FLOW']
    fbs_waste = fbs.loc[fbs['FlowType'] == 'WASTE_FLOW']
    fbs_list = []
    if len(fbs_elem) > 0:
        fbs_elem = map_flows(fbs_elem,
                             list(inventory_dict.keys()),
                             flow_type='ELEMENTARY_FLOW')
        fbs_list.append(fbs_elem)
    if len(fbs_waste) > 0:
        fbs_waste = map_flows(fbs_waste,
                              list(inventory_dict.keys()),
                              flow_type='WASTE_FLOW')
        fbs_list.append(fbs_waste)

    if len(fbs_list) == 1:
        fbs_mapped = fbs_list[0]
    else:
        fbs_mapped = pd.concat(fbs_list).reset_index(drop=True)

    # rename columns to match flowbysector format
    fbs_mapped = fbs_mapped.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # add hardcoded fields; depending on the source data,
    # some of these may need to change
    fbs_mapped['Class'] = 'Chemicals'
    fbs_mapped['SectorConsumedBy'] = 'None'
    fbs_mapped['SectorSourceName'] = 'NAICS_2012_Code'

    fbs_mapped = assign_fips_location_system(fbs_mapped,
                                             list(inventory_dict.values())[0])

    # add missing flow by sector fields
    fbs_mapped = add_missing_flow_by_fields(fbs_mapped, flow_by_sector_fields)

    fbs_mapped = check_for_missing_sector_data(fbs_mapped, NAICS_level)

    # sort dataframe and reset index
    fbs_mapped = fbs_mapped.sort_values(list(
        flow_by_sector_fields.keys())).reset_index(drop=True)

    # check the sector codes to make sure they are NAICS 2012 codes
    fbs_mapped = replace_naics_w_naics_from_another_year(
        fbs_mapped, 'NAICS_2012_Code')

    return fbs_mapped
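
The mixed aggregation at the heart of prepare_stewi_fbs, summing FlowAmount while carrying Year and Unit through with 'first', can be seen in isolation below; the data are made up for illustration:

import pandas as pd

df = pd.DataFrame({
    'NAICS_lvl': ['3251', '3251', '3252'],
    'FlowName': ['Toluene', 'Toluene', 'Benzene'],
    'FlowAmount': [2.0, 3.0, 1.5],
    'Year': ['2017', '2017', '2017'],
    'Unit': ['kg', 'kg', 'kg'],
})

# FlowAmount sums within each group; Year and Unit are constant per
# group, so 'first' simply carries them through
fbs = df.groupby(['NAICS_lvl', 'FlowName']).agg(
    {'FlowAmount': 'sum', 'Year': 'first', 'Unit': 'first'})
print(fbs.reset_index())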