def allocate_usda_ers_mlu_other_land(df, attr, fbs_list):
    """
    From the USDA ERS MLU 2012 report:
    "Includes miscellaneous other uses, such as industrial and commercial
    sites in rural areas, cemeteries, golf courses, mining areas, quarry
    sites, marshes, swamps, sand dunes, bare rocks, deserts, tundra, rural
    residential, and other unclassified land. In this report, urban land is
    reported as a separate category."

    Mining data is calculated using a separate source: BLM PLS.
    Want to extract rural residential land area from the total value of
    'Other Land'.
    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior to the
        activity set that calls on this fxn
    :return: df, allocated USDA ERS MLU Land, FBS format
    """
    from flowsa.values_from_literature import \
        get_area_of_rural_land_occupied_by_houses_2013
    from flowsa.common import load_household_sector_codes

    # land in rural residential lots
    rural_res = get_area_of_rural_land_occupied_by_houses_2013()

    # household codes
    household = load_household_sector_codes()
    household = household['Code'].drop_duplicates().tolist()

    # in df, where sector is a personal expenditure value, and
    # location = 00000, replace with rural res value
    vLogDetailed.info(
        'The only category for MLU other land use is rural land '
        'occupation. All other land area in this category is unassigned to '
        'sectors, resulting in unaccounted land area.')
    df['FlowAmount'] = np.where(df['SectorConsumedBy'].isin(household),
                                rural_res, df['FlowAmount'])

    return df
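# Illustrative sketch (not part of flowsa): the np.where pattern above
# overwrites FlowAmount only for household-sector rows, leaving all other
# rows untouched. The household code 'F010' and the area value 42.0 are
# assumed here purely for demonstration; real values come from
# load_household_sector_codes() and
# get_area_of_rural_land_occupied_by_houses_2013().
def _demo_replace_household_flowamount():
    import numpy as np
    import pandas as pd
    demo = pd.DataFrame({'SectorConsumedBy': ['F010', '111110'],
                         'FlowAmount': [100.0, 50.0]})
    demo['FlowAmount'] = np.where(demo['SectorConsumedBy'].isin(['F010']),
                                  42.0, demo['FlowAmount'])
    return demo  # household row becomes 42.0; the NAICS row stays 50.0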
def add_non_naics_sectors(sector_list, sector_level):
    """
    Add non-NAICS sectors (household codes) used alongside NAICS to a
    sector list.
    :param sector_list: list of NAICS sectors
    :param sector_level: str, NAICS level to use
    :return: list of sectors, including non-NAICS codes
    """
    # load non-NAICS sectors used with NAICS
    household = load_household_sector_codes()
    household = household.loc[
        household['NAICS_Level_to_Use_For'] == sector_level]
    # add household sector to sector list
    sector_list.extend(household['Code'].tolist())
    # add "None" to sector list so don't lose rows when filtering df to
    # match sector length
    # sector_list.extend(["None"])

    return sector_list
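# Hypothetical usage sketch (assumes the household codes file labels levels
# with values such as 'NAICS_6' in its 'NAICS_Level_to_Use_For' column):
# starting from a list of 6-digit NAICS, append the household codes defined
# for that level.
#
#   sector_list = add_non_naics_sectors(['111110', '111120'], 'NAICS_6')
#   # -> original NAICS codes plus any household codes for that level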
def expand_naics_list(df, sectorsourcename):
    """
    Add disaggregated sectors to the crosswalks.
    :param df: df, activity-to-sector crosswalk with a 'Sector' column
    :param sectorsourcename: str, sector source name, e.g. 'NAICS_2012_Code'
    :return: df, crosswalk with sectors expanded to disaggregated NAICS
    """
    # load master crosswalk
    cw = load_sector_crosswalk()
    sectors = cw.loc[:, [sectorsourcename]]
    # drop duplicates
    sectors = sectors.drop_duplicates().dropna()
    # add non-naics to sector list
    household = load_household_sector_codes()
    household = pd.DataFrame(household['Code'].drop_duplicates())
    household.columns = [sectorsourcename]
    sectors = sectors.append(
        household, sort=False).drop_duplicates().reset_index(drop=True)
    # drop rows that contain hyphenated sectors
    sectors = sectors[
        ~sectors[sectorsourcename].str.contains("-")].reset_index(drop=True)
    # Ensure 'None' not added to sectors
    sectors = sectors[sectors[sectorsourcename] != "None"]

    # create list of sectors that exist in the original df, which, if
    # created when expanding the sector list, cannot be added
    existing_sectors = df[['Sector']]
    existing_sectors = existing_sectors.drop_duplicates()

    naics_df = pd.DataFrame([])
    for i in existing_sectors['Sector']:
        dig = len(str(i))
        n = sectors.loc[
            sectors[sectorsourcename].apply(lambda x: x[0:dig]) == i]
        if len(n) != 0:
            n = n.assign(Sector=i)
            naics_df = naics_df.append(n)

    # merge df to retain activityname/sectortype info
    naics_expanded = df.merge(naics_df, how='left')
    # drop column of aggregated naics and rename column of
    # disaggregated naics
    naics_expanded = naics_expanded.drop(columns=["Sector"])
    naics_expanded = naics_expanded.rename(
        columns={sectorsourcename: 'Sector'})
    # drop duplicates and rearrange df columns
    naics_expanded = naics_expanded.drop_duplicates()
    naics_expanded = naics_expanded[
        ['ActivitySourceName', 'Activity', 'Sector', 'SectorType']]

    return naics_expanded
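# Minimal sketch of the prefix match used in expand_naics_list(): an
# aggregated sector such as '1111' matches every crosswalk code beginning
# with those digits. Toy data only; the real codes come from the master
# crosswalk loaded by load_sector_crosswalk().
def _demo_prefix_match():
    import pandas as pd
    sectors = pd.DataFrame(
        {'NAICS_2012_Code': ['111110', '111120', '112111']})
    i = '1111'
    dig = len(str(i))
    n = sectors.loc[
        sectors['NAICS_2012_Code'].apply(lambda x: x[0:dig]) == i]
    return n  # -> rows '111110' and '111120'; '112111' is excluded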
# concat df list and drop duplicates
missing_naics_df = pd.concat(
    missing_naics_df_list, ignore_index=True,
    sort=False).drop_duplicates().reset_index(drop=True)
missing_naics_df = missing_naics_df[
    missing_naics_df['NAICS_2012_Code'] != 'None']
# sort df
missing_naics_df = missing_naics_df.sort_values(
    ['NAICS_2012_Code', 'NAICS_2007_Code'])
missing_naics_df = missing_naics_df.reset_index(drop=True)

# add missing naics to master naics crosswalk
total_naics = naics.append(missing_naics_df, ignore_index=True)

# append household codes
household = load_household_sector_codes()
h = household['Code'].drop_duplicates().tolist()
for i in h:
    if (total_naics['NAICS_2012_Code'] != i).all():
        total_naics = total_naics.append(
            {'NAICS_2007_Code': np.nan,
             'NAICS_2012_Code': i,
             'NAICS_2017_Code': np.nan},
            ignore_index=True)

# sort df
total_naics = total_naics.sort_values(
    ['NAICS_2012_Code', 'NAICS_2007_Code']).drop_duplicates()
total_naics = total_naics[~total_naics['NAICS_2012_Code'].isin(
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector
        method yaml name
    :return: flowbysector
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k +
                 " in year " + str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']],
                                         years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__],
                            v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k +
        #          ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert unit todo: think about unit conversion here
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " +
                     ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity
            flow_subset = flows[
                (flows[fba_activity_fields[0]].isin(names)) |
                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale
            log.info("Checking if flowbyactivity data exists for " +
                     ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(
                flow_subset, names, v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue
                # warning and use data at a less aggregated geoscale, then
                # sum to the specified geoscale
                log.info("Checking if flowbyactivity data exists for " +
                         ', '.join(map(str, names)) +
                         " at a less aggregated level")
                new_geoscale_to_use = \
                    check_if_data_exists_at_less_aggregated_geoscale(
                        flow_subset, names, v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate
            # usgs activity to target scale
            if fips_number_key[activity_from_scale] > \
                    fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale +
                         " to " + activity_to_scale)
                flow_subset = agg_by_geoscale(
                    flow_subset, activity_from_scale, activity_to_scale,
                    fba_default_grouping_fields, names)
            # else, aggregate to the geoscale to use
            elif fips_number_key[activity_from_scale] > \
                    fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale +
                         " to " + v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(
                    flow_subset, activity_from_scale, v['geoscale_to_use'],
                    fba_default_grouping_fields, names)
            # else, if usgs is more aggregated than allocation table,
            # filter relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(
                    flow_subset, activity_from_scale, names)

            # location column: pad zeros if necessary
            flow_subset['Location'] = flow_subset['Location'].apply(
                lambda x: x.ljust(3 + len(x), '0') if len(x) < 5 else x)

            # Add sectors to usgs activity, creating two versions of the
            # flow subset: the first version, "flow_subset_wsec", is the
            # most disaggregated version of the sectors (NAICS); the second
            # version, "flow_subset_wsec_agg", includes only the most
            # aggregated level of sectors
            log.info("Adding sectors to " + k + " for " +
                     ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(
                flow_subset,
                sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(
                flow_subset,
                sectorsourcename=method['target_sector_source'],
                levelofSectoragg='agg')

            # if allocation method is "direct", there is no need to create
            # allocation ratios; else, use the allocation dataframe to
            # create sector allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " +
                         attr['allocation_source'] + " for year " +
                         str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(
                    flowclass=[attr['allocation_source_class']],
                    datasource=attr['allocation_source'],
                    years=[attr['allocation_source_year']]
                    ).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(
                    value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['FlowName'].isin(
                            attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(
                            attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale
                log.info("Checking if allocation data exists for " +
                         ', '.join(map(str, names)) + " at the " +
                         attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(
                    fba_allocation, names, attr['allocation_from_scale'])

                # aggregate geographically to the scale of the
                # flowbyactivity source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df,
                # aggregate allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(
                        fba_allocation, from_scale, to_scale,
                        fba_default_grouping_fields, names)
                # else, if usgs is more aggregated than allocation table,
                # use usgs as both to and from scale
                else:
                    fba_allocation = filter_by_geoscale(
                        fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(
                    fba_allocation,
                    sectorsourcename=method['target_sector_source'],
                    levelofSectoragg=attr['allocation_sector_aggregation'])
                # subset fba datasets to only keep the naics associated
                # with usgs activity subset
                log.info("Subsetting " + attr['allocation_source'] +
                         " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(
                    fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = \
                    fba_allocation_subset.reset_index(drop=True)
                # generalize activity field names to enable link to water
                # withdrawal table
                log.info("Generalizing activity names in subset of " +
                         attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(
                    fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(
                    columns=['Activity'])

                # if there is an allocation helper dataset, modify
                # allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation helper for "
                             "subset of " + attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(
                        fba_allocation_subset, method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " +
                         attr['allocation_source'])
                flow_allocation = allocate_by_sector(
                    fba_allocation_subset, attr['allocation_method'])

                # create list of sectors in the flow allocation df, then
                # drop any rows of data in the flow df that aren't in
                # that list
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset fba allocation table to the values in the
                # activity list, based on overlapping sectors
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]]
                     .isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]]
                     .isin(sector_list))]

                # check if fba and allocation dfs have the same
                # LocationSystem
                log.info("Checking if flowbyactivity and allocation "
                         "dataframes use the same location systems")
                check_if_location_systems_match(flow_subset_wsec,
                                                flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                log.info("Merge " + k + " and subset of " +
                         attr['allocation_source'])
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem',
                                     'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem',
                             'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'],
                    how='left')

                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem',
                                     'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem',
                             'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'],
                    how='left')

                # drop rows where both sector produced/consumed by in
                # flow allocation df are null
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                 how='all').reset_index()

                # merge the flowamountratio columns
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(
                    fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * \
                    fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x',
                                        'Sector_y', 'FlowAmountRatio_y',
                                        'FlowAmountRatio',
                                        'ActivityProducedBy',
                                        'ActivityConsumedBy'])

            # rename flow name to flowable
            fbs = fbs.rename(columns={'FlowName': 'Flowable',
                                      'Compartment': 'Context'})

            # drop rows where flowamount = 0 (although this includes
            # dropping suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            log.info("Aggregating flowbysector to " +
                     method['target_geoscale'] + " level")
            if fips_number_key[v['geoscale_to_use']] < \
                    fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']

            fbs = agg_by_geoscale(fbs, from_scale, to_scale,
                                  fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " +
                     method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = \
                cw[method['target_sector_level']].unique().tolist()

            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[
                household['NAICS_Level_to_Use_For'] ==
                method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())

            # subset df
            fbs = fbs.loc[
                (fbs[fbs_activity_fields[0]].isin(sector_list)) |
                (fbs[fbs_activity_fields[1]].isin(sector_list))
                ].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate
            # data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with "
                     "flows " + ', '.join(map(str, names)))
            fbss.append(fbs)

    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified
    # sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy',
         'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
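# Hypothetical invocation sketch: method_name corresponds to a flowbysector
# method yaml file; 'Water_national_2015_m1' is an assumed example name, not
# necessarily one shipped with the package.
if __name__ == '__main__':
    main(method_name='Water_national_2015_m1')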