def check_if_activities_match_sectors(fba):
    """
    Checks if activities in a flowbyactivity dataset that appear to be
    sector-like are actually sectors
    :param fba: a flow by activity dataset
    :return: A list of activities not matching the default sector list,
        or None if there is a 100% match
    """
    # Get list of activities in a flowbyactivity file
    activities = []
    for f in fba_activity_fields:
        activities.extend(fba[f])

    # Get list of module default sectors
    flowsa_sector_list = list(load_sector_crosswalk()[sector_source_name])
    activities_missing_sectors = set(activities) - set(flowsa_sector_list)

    if len(activities_missing_sectors) > 0:
        log.info(str(len(activities_missing_sectors)) +
                 " activities not matching sectors in default " +
                 sector_source_name + " list.")
        return activities_missing_sectors
    else:
        log.info("All activities match sectors in " +
                 sector_source_name + " list.")
        return None
def melt_naics_crosswalk():
    """
    Create a melted version of the NAICS 2007 to 2017 crosswalk,
    mapping NAICS codes from other years to NAICS 2012
    :return: df of melted crosswalk with allocation ratios
    """
    # load the master crosswalk
    cw_load = load_sector_crosswalk()

    # create melt table of possible 2007 and 2017 naics that can be mapped to 2012
    cw_melt = cw_load.melt(id_vars='NAICS_2012_Code', var_name='NAICS_year',
                           value_name='NAICS')
    # drop the naics year because it is not relevant for replacement purposes
    cw_replacement = cw_melt.dropna(how='any')
    cw_replacement = cw_replacement[
        ['NAICS_2012_Code', 'NAICS']].drop_duplicates()
    # drop rows where contents are equal
    cw_replacement = cw_replacement[
        cw_replacement['NAICS_2012_Code'] != cw_replacement['NAICS']]
    # drop rows where length > 6
    cw_replacement = cw_replacement[cw_replacement['NAICS_2012_Code'].apply(
        lambda x: len(x) < 7)].reset_index(drop=True)
    # order by naics 2012
    cw_replacement = cw_replacement.sort_values(
        ['NAICS', 'NAICS_2012_Code']).reset_index(drop=True)
    # create allocation ratios by determining the number of NAICS 2012 codes
    # mapped to each other-year NAICS code when the relationship is not 1:1
    cw_replacement_2 = cw_replacement.assign(
        naics_count=cw_replacement.groupby(
            ['NAICS'])['NAICS_2012_Code'].transform('count'))
    cw_replacement_2 = cw_replacement_2.assign(
        allocation_ratio=1 / cw_replacement_2['naics_count'])

    return cw_replacement_2
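# The sketch below (not from flowsa) illustrates the melt/allocation-ratio
# pattern used in melt_naics_crosswalk() on a tiny, made-up crosswalk: when a
# single other-year NAICS code maps to several NAICS 2012 codes, each pairing
# receives an equal allocation_ratio so flow amounts can later be split
# without changing their total.
import pandas as pd

toy_cw = pd.DataFrame({
    'NAICS_2012_Code': ['111110', '111120', '222200'],
    'NAICS_2007_Code': ['111100', '111100', '222200'],
    'NAICS_2017_Code': ['111110', '111120', '222210']})

toy_melt = toy_cw.melt(id_vars='NAICS_2012_Code', var_name='NAICS_year',
                       value_name='NAICS')
toy_melt = toy_melt[['NAICS_2012_Code', 'NAICS']].drop_duplicates()
# keep only rows that actually require replacement
toy_melt = toy_melt[toy_melt['NAICS_2012_Code'] != toy_melt['NAICS']]
# '111100' (2007) maps to two 2012 codes, so each pairing gets a ratio of 0.5
toy_melt = toy_melt.assign(
    naics_count=toy_melt.groupby('NAICS')['NAICS_2012_Code'].transform('count'))
toy_melt = toy_melt.assign(allocation_ratio=1 / toy_melt['naics_count'])
print(toy_melt)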
def expand_naics_list(df, sectorsourcename):
    """
    Add disaggregated sectors to the activity-to-sector mapping
    :param df: df, activity-to-sector mapping
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df with aggregated sectors expanded to their disaggregated
        child sectors
    """
    # load master crosswalk
    cw = load_sector_crosswalk()
    sectors = cw.loc[:, [sectorsourcename]]

    # Create mapping df that's just the sectors at first
    sectors = sectors.drop_duplicates().dropna()
    # ensure the sector column is a string
    df['Sector'] = df['Sector'].astype('str')

    naics_df = pd.DataFrame([])
    for i in df['Sector']:
        dig = len(str(i))
        n = sectors.loc[sectors[sectorsourcename].apply(
            lambda x: str(x[0:dig])) == i]
        n['Sector'] = i
        naics_df = naics_df.append(n)

    # merge df to retain activityname/sectortype info
    naics_expanded = df.merge(naics_df, how='left')
    # drop column of aggregated naics and rename column of disaggregated naics
    naics_expanded = naics_expanded.drop(columns=["Sector"])
    naics_expanded = naics_expanded.rename(
        columns={sectorsourcename: 'Sector'})
    # drop duplicates and rearrange df columns
    naics_expanded = naics_expanded.drop_duplicates()
    naics_expanded = naics_expanded[[
        'ActivitySourceName', 'Activity', 'Sector', 'SectorType']]

    return naics_expanded
def replace_naics_w_naics_2012(df, sectorsourcename):
    """
    Check if the sectors (or sector-like activities) in the df are NAICS 2012
    codes; if not, replace them with their NAICS 2012 equivalents from the
    crosswalk, scaling FlowAmount by the allocation ratio where one code maps
    to multiple NAICS 2012 codes. Also works for the Sector column.
    :param df: df with sector columns or sector-like activities
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df with non-NAICS 2012 codes replaced
    """
    # drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # load the master crosswalk and subset by sectorsourcename, save values to list
    cw_load = load_sector_crosswalk()
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    possible_column_headers = ['Sector', 'SectorProducedBy', 'SectorConsumedBy']
    # list of column headers that do exist in the df being aggregated
    column_headers = [e for e in possible_column_headers
                      if e in df.columns.values.tolist()]

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics2012 = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is not in crosswalk list
    if len(non_naics2012) != 0:
        log.info('Checking if sectors represent a different NAICS year, '
                 'if so, replace with NAICS 2012')
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the 'NAICS_2012_Code' column,
            # use that value to replace the sector in column c
            df.loc[df[c] == df['NAICS'], c] = df['NAICS_2012_Code']
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df['NAICS_2012_Code'],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(columns=['NAICS_2012_Code', 'NAICS', 'allocation_ratio'])
        log.info('Replaced NAICS with NAICS 2012 Codes')

        # check again for any sectors that are not in the naics 2012 crosswalk
        log.info('Check again for non NAICS 2012 Codes')
        check_if_sectors_are_naics(df, cw, column_headers)
    else:
        log.info('No sectors require substitution')

    return df
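# Toy illustration (not from flowsa) of the replacement step performed in
# replace_naics_w_naics_2012(): a NAICS 2007 code that split into two
# NAICS 2012 codes is merged against the melted crosswalk, its FlowAmount is
# scaled by allocation_ratio, and the old code is swapped out, so the total
# flow is preserved. Codes and amounts are made up.
import pandas as pd

flows = pd.DataFrame({'Sector': ['111100'], 'FlowAmount': [100.0]})
toy_cw_melt = pd.DataFrame({'NAICS_2012_Code': ['111110', '111120'],
                            'NAICS': ['111100', '111100'],
                            'allocation_ratio': [0.5, 0.5]})

flows = flows.merge(toy_cw_melt, left_on='Sector', right_on='NAICS', how='left')
# scale the flow before swapping in the 2012 code
flows.loc[flows['Sector'] == flows['NAICS'], 'FlowAmount'] = \
    flows['FlowAmount'] * flows['allocation_ratio']
flows.loc[flows['Sector'] == flows['NAICS'], 'Sector'] = flows['NAICS_2012_Code']
flows = flows.drop(columns=['NAICS_2012_Code', 'NAICS', 'allocation_ratio'])
print(flows)  # two rows of 50.0 each, summing to the original 100.0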
def expand_naics_list(df, sectorsourcename):
    """
    Add disaggregated sectors to the crosswalks.
    :param df: df, activity-to-sector mapping
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df with aggregated sectors expanded to their disaggregated
        child sectors
    """
    # load master crosswalk
    cw = load_sector_crosswalk()
    sectors = cw.loc[:, [sectorsourcename]]
    # drop duplicates
    sectors = sectors.drop_duplicates().dropna()
    # add non-naics to sector list
    household = load_household_sector_codes()
    household = pd.DataFrame(household['Code'].drop_duplicates())
    household.columns = [sectorsourcename]
    sectors = sectors.append(
        household, sort=False).drop_duplicates().reset_index(drop=True)
    # drop rows that contain hyphenated sectors
    sectors = sectors[
        ~sectors[sectorsourcename].str.contains("-")].reset_index(drop=True)
    # Ensure 'None' not added to sectors
    sectors = sectors[sectors[sectorsourcename] != "None"]

    # create list of sectors that exist in the original df; sectors created
    # when expanding the sector list cannot be added
    existing_sectors = df[['Sector']]
    existing_sectors = existing_sectors.drop_duplicates()

    naics_df = pd.DataFrame([])
    for i in existing_sectors['Sector']:
        dig = len(str(i))
        n = sectors.loc[
            sectors[sectorsourcename].apply(lambda x: x[0:dig]) == i]
        if len(n) != 0:
            n = n.assign(Sector=i)
            naics_df = naics_df.append(n)

    # merge df to retain activityname/sectortype info
    naics_expanded = df.merge(naics_df, how='left')
    # drop column of aggregated naics and rename column of disaggregated naics
    naics_expanded = naics_expanded.drop(columns=["Sector"])
    naics_expanded = naics_expanded.rename(
        columns={sectorsourcename: 'Sector'})
    # drop duplicates and rearrange df columns
    naics_expanded = naics_expanded.drop_duplicates()
    naics_expanded = naics_expanded[[
        'ActivitySourceName', 'Activity', 'Sector', 'SectorType']]

    return naics_expanded
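# Minimal sketch (not from flowsa) of the prefix expansion used in
# expand_naics_list(): an aggregated sector (here a 4-digit code) is matched
# to every crosswalk code that starts with it. Column names follow the
# functions above; the codes are illustrative.
import pandas as pd

toy_sectors = pd.DataFrame(
    {'NAICS_2012_Code': ['111110', '111120', '111211', '222200']})
toy_mapping = pd.DataFrame({'ActivitySourceName': ['toy_source'],
                            'Activity': ['1111'],
                            'Sector': ['1111'],
                            'SectorType': [None]})

naics_df = pd.DataFrame()
for i in toy_mapping['Sector']:
    dig = len(str(i))
    n = toy_sectors.loc[toy_sectors['NAICS_2012_Code'].str[:dig] == i]
    if len(n) != 0:
        naics_df = pd.concat([naics_df, n.assign(Sector=i)])

expanded = (toy_mapping.merge(naics_df, how='left')
            .drop(columns='Sector')
            .rename(columns={'NAICS_2012_Code': 'Sector'}))
print(expanded)  # Activity '1111' now maps to 111110 and 111120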
def check_if_activities_match_sectors(fba):
    """
    Checks if activities in a flowbyactivity dataset that appear to be
    sector-like are actually sectors
    :param fba: a flow by activity dataset
    :return: A list of activities not matching the default sector list,
        or None if there is a 100% match
    """
    # Get list of activities in a flowbyactivity file
    activities = []
    for f in fba_activity_fields:
        activities.extend(fba[f])

    # Get list of module default sectors
    flowsa_sector_list = list(load_sector_crosswalk()[SECTOR_SOURCE_NAME])
    activities_missing_sectors = set(activities) - set(flowsa_sector_list)

    if len(activities_missing_sectors) > 0:
        vLog.debug("%s activities not matching sectors in default %s list",
                   str(len(activities_missing_sectors)), SECTOR_SOURCE_NAME)
        return activities_missing_sectors
def add_sectors_to_flowbyactivity(flowbyactivity_df,
                                  sectorsourcename=sector_source_name):
    """
    Add Sectors from the Activity fields and map them to Sector
    from the crosswalk. No allocation is performed.
    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :return: a df with activity fields mapped to 'sectors'
    """
    mappings = []

    # First check if source activities are NAICS like -
    # if so make it into a mapping file
    cat = load_source_catalog()

    for s in pd.unique(flowbyactivity_df['SourceName']):
        src_info = cat[s]
        # read the pre-determined level of sector aggregation of
        # each crosswalk from the source catalog
        levelofSectoragg = src_info['sector_aggregation_level']
        # if data are provided in NAICS format, use the mastercrosswalk
        if src_info['sector-like_activities']:
            cw = load_sector_crosswalk()
            sectors = cw.loc[:, [sector_source_name]]
            # Create mapping df that's just the sectors at first
            mapping = sectors.drop_duplicates()
            # Add the sector twice as activities so mapping is identical
            mapping = mapping.assign(Activity=sectors[sector_source_name])
            mapping = mapping.rename(columns={sector_source_name: "Sector"})
            # add columns so expand_naics_list() can be run;
            # if sector-like_activities = True, columns are missing, so add
            mapping['ActivitySourceName'] = s
            # tmp assignment
            mapping['SectorType'] = None
            # Include all digits of naics in mapping, if levelofNAICSagg
            # is specified as "aggregated"
            if levelofSectoragg == 'aggregated':
                mapping = expand_naics_list(mapping, sectorsourcename)
        else:
            # if source data activities are text strings, call on the
            # manually created source crosswalks
            mapping = get_activitytosector_mapping(s)
            # filter by SectorSourceName of interest
            mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
            # drop SectorSourceName
            mapping = mapping.drop(columns=['SectorSourceName'])
            # Include all digits of naics in mapping, if levelofNAICSagg
            # is specified as "aggregated"
            if levelofSectoragg == 'aggregated':
                mapping = expand_naics_list(mapping, sectorsourcename)
        mappings.append(mapping)
    mappings_df = pd.concat(mappings, sort=False)

    # Merge the mapping into the flowbyactivity df for each activity field
    flowbyactivity_wsector_df = flowbyactivity_df
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mappings_df.rename(
            columns={'Activity': flowbyactivity_field,
                     'Sector': flowbysector_field,
                     'SectorType': sector_type_field})
        # column doesn't exist for sector-like activities,
        # so ignore if an error occurs
        mappings_df_tmp = mappings_df_tmp.drop(
            columns=['ActivitySourceName'], errors='ignore')
        # Merge them in. Critical this is a left merge to
        # preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(flowbyactivity_wsector_df,
                                             mappings_df_tmp, how='left',
                                             on=flowbyactivity_field)
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.replace(
        {np.nan: None})

    return flowbyactivity_wsector_df
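# Stripped-down sketch (not from flowsa) of the per-direction left merge in
# add_sectors_to_flowbyactivity(): the mapping is merged onto the
# flowbyactivity rows by activity name, and the left merge keeps activities
# without a sector match rather than dropping them. Column names follow the
# flowsa schema; the data are made up.
import numpy as np
import pandas as pd

toy_fba = pd.DataFrame({'ActivityProducedBy': ['Corn', 'Soybeans', 'Unknown'],
                        'FlowAmount': [10.0, 5.0, 2.0]})
toy_map = pd.DataFrame({'ActivityProducedBy': ['Corn', 'Soybeans'],
                        'SectorProducedBy': ['111150', '111110'],
                        'ProducedBySectorType': ['I', 'I']})

toy_fba_wsec = pd.merge(toy_fba, toy_map, how='left', on='ActivityProducedBy')
toy_fba_wsec = toy_fba_wsec.replace({np.nan: None})
print(toy_fba_wsec)  # the 'Unknown' row is kept, with SectorProducedBy == None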
def add_sectors_to_flowbyactivity(flowbyactivity_df,
                                  sectorsourcename=sector_source_name,
                                  **kwargs):
    """
    Add Sectors from the Activity fields and map them to Sector
    from the crosswalk. No allocation is performed.
    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :param kwargs: option to include the parameter 'allocationmethod',
        which modifies function behavior if = 'direct'
    :return: a df with activity fields mapped to 'sectors'
    """
    # First check if source activities are NAICS like -
    # if so make it into a mapping file
    cat = load_source_catalog()

    # use the first (and only) source name in the df
    s = pd.unique(flowbyactivity_df['SourceName'])[0]
    # load catalog info for source
    src_info = cat[s]
    # if activities are sector-like, check if need to modify mapping
    if 'modify_sector-like_activities' in src_info:
        modify_sector_like_activities = src_info['modify_sector-like_activities']
    else:
        modify_sector_like_activities = False
    # read the pre-determined level of sector aggregation of each
    # crosswalk from the source catalog
    levelofSectoragg = src_info['sector_aggregation_level']
    # if the FBS activity set is 'direct', overwrite the levelofsectoragg,
    # or if specified in fxn call
    if kwargs != {}:
        if 'allocationmethod' in kwargs:
            if kwargs['allocationmethod'] == 'direct':
                levelofSectoragg = 'disaggregated'
        if 'overwrite_sectorlevel' in kwargs:
            levelofSectoragg = kwargs['overwrite_sectorlevel']
    # if data are provided in NAICS format, use the mastercrosswalk
    if src_info['sector-like_activities'] and \
            modify_sector_like_activities is False:
        cw = load_sector_crosswalk()
        sectors = cw.loc[:, [sector_source_name]]
        # Create mapping df that's just the sectors at first
        mapping = sectors.drop_duplicates()
        # Add the sector twice as activities so mapping is identical
        mapping = mapping.assign(Activity=sectors[sector_source_name])
        mapping = mapping.rename(columns={sector_source_name: "Sector"})
        # add columns so expand_naics_list() can be run;
        # if sector-like_activities = True, columns are missing, so add
        mapping['ActivitySourceName'] = s
        # tmp assignment
        mapping['SectorType'] = None
        # Include all digits of naics in mapping, if levelofNAICSagg
        # is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    else:
        # if source data activities are text strings, or sector-like
        # activities should be modified, call on the manually created
        # source crosswalks
        mapping = get_activitytosector_mapping(s)
        # filter by SectorSourceName of interest
        mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
        # drop SectorSourceName
        mapping = mapping.drop(columns=['SectorSourceName'])
        # Include all digits of naics in mapping, if levelofNAICSagg
        # is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)

    # Merge the mapping into the flowbyactivity df for each activity field
    flowbyactivity_wsector_df = flowbyactivity_df
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mapping.rename(
            columns={'Activity': flowbyactivity_field,
                     'Sector': flowbysector_field,
                     'SectorType': sector_type_field})
        # column doesn't exist for sector-like activities,
        # so ignore if an error occurs
        mappings_df_tmp = mappings_df_tmp.drop(
            columns=['ActivitySourceName'], errors='ignore')
        # Merge them in. Critical this is a left merge to
        # preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(flowbyactivity_wsector_df,
                                             mappings_df_tmp, how='left',
                                             on=flowbyactivity_field)
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.replace(
        {np.nan: None})
    # add sector source name
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.assign(
        SectorSourceName=sectorsourcename)

    # if activities are sector-like, check that the sectors are in the crosswalk
    if src_info['sector-like_activities']:
        flowbyactivity_wsector_df = replace_naics_w_naics_2012(
            flowbyactivity_wsector_df, sectorsourcename)

    return flowbyactivity_wsector_df
def replace_naics_w_naics_from_another_year(df_load, sectorsourcename):
    """
    Replace any non-sectors with sectors.
    :param df_load: df with sector columns or sector-like activities
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df, with non-sectors replaced with sectors
    """
    # from flowsa.flowbyfunctions import aggregator

    # drop NoneType
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # load the master crosswalk and subset by sectorsourcename, save values to list
    cw_load = load_sector_crosswalk()
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    if 'SectorConsumedBy' in df:
        column_headers = ['SectorProducedBy', 'SectorConsumedBy']
    else:
        column_headers = ['ActivityProducedBy', 'ActivityConsumedBy']

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is not in crosswalk list
    if len(non_naics) != 0:
        vLog.debug('Checking if sectors represent a different '
                   'NAICS year, if so, replace with %s', sectorsourcename)
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the sectorsourcename column,
            # use that value to replace the sector in column c if the value
            # in column c is in the non_naics list
            df[c] = np.where((df[c] == df['NAICS']) & (df[c].isin(non_naics)),
                             df[sectorsourcename], df[c])
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df[sectorsourcename],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(columns=[sectorsourcename, 'NAICS', 'allocation_ratio'])
        vLog.debug('Replaced NAICS with %s', sectorsourcename)

        # check if there are any sectors that are not in the naics 2012 crosswalk
        vLog.debug('Check again for non NAICS 2012 Codes')
        nonsectors = check_if_sectors_are_naics(df, cw, column_headers)
        if len(nonsectors) != 0:
            vLog.debug('Dropping non-NAICS from dataframe')
            for c in column_headers:
                # drop rows where column value is in the nonsectors list
                df = df[~df[c].isin(nonsectors)]
        # aggregate data
        possible_column_headers = (
            'FlowAmount', 'Spread', 'Min', 'Max', 'DataReliability',
            'TemporalCorrelation', 'GeographicalCorrelation',
            'TechnologicalCorrelation', 'DataCollection', 'Description')
        # list of column headers to group aggregation by
        groupby_cols = [e for e in df.columns.values.tolist()
                        if e not in possible_column_headers]
        df = aggregator(df, groupby_cols)

    # drop rows where both SectorConsumedBy and SectorProducedBy are NoneType
    if 'SectorConsumedBy' in df:
        df_drop = df[(df['SectorConsumedBy'].isnull()) &
                     (df['SectorProducedBy'].isnull())]
        if len(df_drop) != 0:
            activities_dropped = pd.unique(
                df_drop[['ActivityConsumedBy',
                         'ActivityProducedBy']].values.ravel('K'))
            activities_dropped = list(
                filter(lambda x: x is not None, activities_dropped))
            vLog.debug('Dropping rows where the Activity columns contain %s',
                       ', '.join(activities_dropped))
        df = df[~((df['SectorConsumedBy'].isnull()) &
                  (df['SectorProducedBy'].isnull()))].reset_index(drop=True)
    else:
        df = df[~((df['ActivityConsumedBy'].isnull()) &
                  (df['ActivityProducedBy'].isnull()))].reset_index(drop=True)

    df = replace_strings_with_NoneType(df)

    return df
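# A minimal sketch (not from flowsa) of the post-replacement aggregation step,
# assuming aggregator() reduces to a grouped sum on FlowAmount for this
# simplified frame: two source-year codes that both map to the same
# NAICS 2012 code collapse into a single row so totals are not double counted.
import pandas as pd

toy_df = pd.DataFrame({'SectorProducedBy': ['111110', '111110', '111120'],
                       'SectorConsumedBy': ['', '', ''],
                       'FlowAmount': [30.0, 20.0, 5.0]})

groupby_cols = [c for c in toy_df.columns if c != 'FlowAmount']
toy_agg = toy_df.groupby(groupby_cols, as_index=False)['FlowAmount'].sum()
print(toy_agg)  # the two 111110 rows combine to 50.0; 111120 stays at 5.0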