def check_if_sectors_are_naics(df, crosswalk_list, column_headers):
    """
    Check if activity-like sectors are in fact NAICS sectors.
    Also works for the Sector column.
    :param df: df with sector-like columns to check
    :param crosswalk_list: list of sectors from the master crosswalk
    :param column_headers: list of column headers to check
    :return: list of dfs containing rows with non-NAICS sectors
    """
    # create a df of non-sectors to export
    non_sectors_df = []
    # create a df of just the non-sectors column
    non_sectors_list = []
    # loop through the df headers and determine if value is not in crosswalk list
    for c in column_headers:
        # create df where sectors do not exist in master crosswalk
        non_sectors = df[~df[c].isin(crosswalk_list)]
        # drop rows where c is empty
        non_sectors = non_sectors[non_sectors[c] != '']
        # subset to just the sector column
        if len(non_sectors) != 0:
            sectors = non_sectors[[c]].rename(columns={c: 'NonSectors'})
            non_sectors_df.append(non_sectors)
            non_sectors_list.append(sectors)

    if len(non_sectors_df) != 0:
        # concat the df and the df of sectors
        # ns_df = pd.concat(non_sectors_df, sort=False, ignore_index=True)
        ns_list = pd.concat(non_sectors_list, sort=False, ignore_index=True)
        # print the NonSectors
        non_sectors = ns_list['NonSectors'].drop_duplicates().tolist()
        log.info('There are sectors that are not NAICS 2012 Codes')
        print(non_sectors)
    else:
        log.info('All sectors are NAICS 2012 Codes')

    return non_sectors_df
def getFlowByActivity(flowclass, years, datasource):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow. required. E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years [2015], or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        # first try reading parquet from your local repo
        try:
            log.info('Loading ' + datasource + ' ' + str(y) + ' parquet from local repository')
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" + str(y) + ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except (OSError, FileNotFoundError):
            # if parquet does not exist in local repo, read file from Data Commons
            try:
                log.info(datasource + ' parquet not found in local repo, loading from Data Commons')
                fba = pd.read_parquet(
                    'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' +
                    datasource + "_" + str(y) + '.parquet')
                fba = fba[fba['Class'].isin(flowclass)]
                fbas = pd.concat([fbas, fba], sort=False)
            except FileNotFoundError:
                log.error("No parquet file found for datasource " + datasource +
                          " and year " + str(y) + " in flowsa or Data Commons")
    return fbas
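# Illustrative usage of the list-based signature above (a sketch, not part of the
# module). The datasource code and year are assumptions and must correspond to an
# existing FlowByActivity parquet, either locally or on Data Commons.
# water_fba = getFlowByActivity(flowclass=['Water'], years=[2015],
#                               datasource='USGS_NWIS_WU')
# print(water_fba[['Class', 'FlowName', 'FlowAmount', 'Unit']].head())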
def map_fbs_flows(fbs, from_fba_source, v, **kwargs):
    """
    Identifies the mapping file and applies mapping to fbs flows
    :param fbs: flow-by-sector dataframe
    :param from_fba_source: str, source name of fba list to look for mappings
    :param v: dictionary, the datasource parameters
    :param kwargs: includes keep_unmapped_columns and keep_fba_columns
    :return fbs_mapped: df, with flows mapped using federal elementary flow list or material flow list
    :return mapping_files: str, name of mapping file
    """
    ignore_source_name = False
    if 'mfl_mapping' in v:
        mapping_files = v['mfl_mapping']
        log.info("Mapping flows in %s to material flow list", from_fba_source)
        flow_type = 'WASTE_FLOW'
        ignore_source_name = True
    else:
        log.info("Mapping flows in %s to federal elementary flow list", from_fba_source)
        if 'fedefl_mapping' in v:
            mapping_files = v['fedefl_mapping']
            ignore_source_name = True
        else:
            mapping_files = from_fba_source
        flow_type = 'ELEMENTARY_FLOW'

    fbs_mapped = map_flows(fbs, mapping_files, flow_type, ignore_source_name, **kwargs)

    return fbs_mapped, mapping_files
def getFlowBySector(methodname):
    """
    Retrieves stored data in the FlowBySector format
    :param methodname: string, Name of an available method for the given class
    :return: dataframe in flow by sector format
    """
    fbs = pd.DataFrame()
    # first try reading parquet from your local repo
    try:
        log.info('Loading ' + methodname + ' parquet from local repository')
        fbs = pd.read_parquet(fbsoutputpath + methodname + ".parquet")
    except (OSError, FileNotFoundError):
        # if parquet does not exist in local repo, read file from Data Commons
        try:
            log.info(methodname + ' parquet not found in local repo, loading from Data Commons')
            fbs = pd.read_parquet(
                'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowBySector/' +
                methodname + ".parquet")
        except FileNotFoundError:
            log.error("No parquet file found for datasource " + methodname +
                      " in flowsa or Data Commons")
    return fbs
def getFlowByActivity(flowclass, years, datasource, geographic_level='all', file_location='local'):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow. required. E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years [2015], or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :param geographic_level: 'all', 'national', 'state', 'county'. Default is 'all'
    :param file_location: 'local' or 'remote'. Default is 'local'
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        # definitions
        fba_file = datasource + "_" + str(y) + ".parquet"
        local_file_path = fbaoutputpath + fba_file
        remote_file_path = 'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' + fba_file
        # load data
        if file_location == 'local':
            fba = load_file(fba_file, local_file_path, remote_file_path)
        else:
            log.info('Loading ' + datasource + ' from remote server')
            fba = pd.read_parquet(remote_file_path)
        fba = fba[fba['Class'].isin(flowclass)]
        # if geographic level specified, only load rows in geo level
        if geographic_level != 'all':
            fba = filter_by_geoscale(fba, geographic_level)
        # concat dfs
        fbas = pd.concat([fbas, fba], sort=False)
    return fbas
def extract_facility_data(inventory_dict):
    """
    Load facility data from stewi for each inventory and assign FIPS codes
    :param inventory_dict: dictionary of inventory types and years (e.g., {'NEI': '2017'})
    :return: df of facilities with FacilityID, State, County, NAICS, and FIPS
    """
    import stewi
    facility_mapping = pd.DataFrame()
    # load facility data from stewi output directory, keeping only the
    # facility IDs and geographic information
    inventory_list = list(inventory_dict.keys())
    for i in range(len(inventory_dict)):
        # define inventory name as inventory type + inventory year (e.g., NEI_2017)
        database = inventory_list[i]
        year = list(inventory_dict.values())[i]
        inventory_name = database + '_' + year
        facilities = stewi.getInventoryFacilities(database, year)
        facilities = facilities[['FacilityID', 'State', 'County', 'NAICS']]
        if len(facilities[facilities.duplicated(subset='FacilityID', keep=False)]) > 0:
            log.info('Duplicate facilities in ' + inventory_name + ' - keeping first listed')
            facilities.drop_duplicates(subset='FacilityID', keep='first', inplace=True)
        facility_mapping = facility_mapping.append(facilities)
    # Apply FIPS to facility locations
    facility_mapping = apply_county_FIPS(facility_mapping)

    return facility_mapping
def check_if_activities_match_sectors(fba):
    """
    Checks if activities in flowbyactivity that appear to be like sectors are actually sectors
    :param fba: a flow by activity dataset
    :return: A list of activities not matching the default sector list or text indicating 100% match
    """
    # Get list of activities in a flowbyactivity file
    activities = []
    for f in fba_activity_fields:
        activities.extend(fba[f])
    # activities.remove("None")

    # Get list of module default sectors
    flowsa_sector_list = list(load_sector_crosswalk()[sector_source_name])
    activities_missing_sectors = set(activities) - set(flowsa_sector_list)

    if len(activities_missing_sectors) > 0:
        log.info(str(len(activities_missing_sectors)) +
                 " activities not matching sectors in default " + sector_source_name + " list.")
        return activities_missing_sectors
    else:
        log.info("All activities match sectors in " + sector_source_name + " list.")
        return None
def return_activity_from_scale(df, provided_from_scale):
    """
    Determine the 'from scale' used for aggregation/df subsetting for each
    activity combo in a df
    :param df: flowbyactivity df
    :param provided_from_scale: The scale to use specified in method yaml
    :return: df of activity combos with the geoscale at which data exist
    """
    # determine the unique combinations of activityproduced/consumedby
    unique_activities = unique_activity_names(df)
    # filter by geoscale
    fips = create_geoscale_list(df, provided_from_scale)
    df_sub = df[df['Location'].isin(fips)]
    # determine unique activities after subsetting by geoscale
    unique_activities_sub = unique_activity_names(df_sub)

    # return df of the difference between unique_activities and unique_activities_sub
    df_missing = dataframe_difference(unique_activities, unique_activities_sub, which='left_only')
    # return df of the similarities between unique_activities and unique_activities_sub
    df_existing = dataframe_difference(unique_activities, unique_activities_sub, which='both')
    df_existing = df_existing.drop(columns='_merge')
    df_existing['activity_from_scale'] = provided_from_scale

    # loop through geoscales until data are found for each activity combo
    if provided_from_scale == 'national':
        geoscales = ['state', 'county']
    elif provided_from_scale == 'state':
        geoscales = ['county']
    elif provided_from_scale == 'county':
        # no smaller geoscale available to check
        geoscales = []
        log.info('No data - skipping')

    if len(df_missing) > 0:
        for i in geoscales:
            # filter by geoscale
            fips_i = create_geoscale_list(df, i)
            df_i = df[df['Location'].isin(fips_i)]
            # determine unique activities after subsetting by geoscale
            unique_activities_i = unique_activity_names(df_i)
            # return df of the difference between the subset and the activities at this geoscale
            df_missing_i = dataframe_difference(unique_activities_sub, unique_activities_i,
                                                which='right_only')
            df_missing_i = df_missing_i.drop(columns='_merge')
            df_missing_i['activity_from_scale'] = i
            # return df of the similarities between the subset and the activities at this geoscale
            df_existing_i = dataframe_difference(unique_activities_sub, unique_activities_i, which='both')
            # append unique activities and df with defined activity_from_scale
            unique_activities_sub = unique_activities_sub.append(
                df_missing_i[[fba_activity_fields[0], fba_activity_fields[1]]])
            df_existing = df_existing.append(df_missing_i)
            df_missing = dataframe_difference(
                df_missing[[fba_activity_fields[0], fba_activity_fields[1]]],
                df_existing_i[[fba_activity_fields[0], fba_activity_fields[1]]],
                which=None)

    return df_existing
def load_map_clean_fba(method, attr, fba_sourcename, df_year, flowclass,
                       geoscale_from, geoscale_to, **kwargs):
    """
    Load, clean, and map a FlowByActivity df
    :param method: dictionary, FBS method yaml
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fba_sourcename: str, source name
    :param df_year: str, year
    :param flowclass: str, flowclass to subset df with
    :param geoscale_from: str, geoscale to use
    :param geoscale_to: str, geoscale to aggregate to
    :param kwargs: dictionary, can include parameters: 'flowname_subset',
        'compartment_subset', 'clean_fba', 'clean_fba_w_sec'
    :return: df, fba format
    """
    log.info("Loading allocation flowbyactivity %s for year %s", fba_sourcename, str(df_year))
    fba = load_fba_w_standardized_units(datasource=fba_sourcename, year=df_year, flowclass=flowclass)

    # check if allocation data exists at specified geoscale to use
    log.info("Checking if allocation data exists at the %s level", geoscale_from)
    check_if_data_exists_at_geoscale(fba, geoscale_from)

    # aggregate geographically to the scale of the flowbyactivity source, if necessary
    fba = subset_df_by_geoscale(fba, geoscale_from, geoscale_to)

    # subset based on yaml settings
    if 'flowname_subset' in kwargs:
        if kwargs['flowname_subset'] != 'None':
            fba = fba.loc[fba['FlowName'].isin(kwargs['flowname_subset'])]
    if 'compartment_subset' in kwargs:
        if kwargs['compartment_subset'] != 'None':
            fba = fba.loc[fba['Compartment'].isin(kwargs['compartment_subset'])]

    # cleanup the fba allocation df, if necessary
    if 'clean_fba' in kwargs:
        log.info("Cleaning %s", fba_sourcename)
        fba = dynamically_import_fxn(fba_sourcename, kwargs["clean_fba"])(fba, attr=attr)
    # reset index
    fba = fba.reset_index(drop=True)

    # assign sector to allocation dataset
    log.info("Adding sectors to %s", fba_sourcename)
    fba_wsec = add_sectors_to_flowbyactivity(fba, sectorsourcename=method['target_sector_source'])

    # call on fxn to further clean up/disaggregate the fba allocation data, if exists
    if 'clean_fba_w_sec' in kwargs:
        log.info("Further disaggregating sectors in %s", fba_sourcename)
        fba_wsec = dynamically_import_fxn(fba_sourcename, kwargs['clean_fba_w_sec'])(
            fba_wsec, attr=attr, method=method, sourcename=fba_sourcename)

    return fba_wsec
def writeFlowBySectorBibliography(methodname):
    """
    Generate bibliography for a FlowBySector method in the local directory
    :param methodname: string, FBS methodname for which to create .bib file
    :return: .bib file saved to local directory
    """
    # Generate a single .bib file for the Flow-By-Sector method name
    # and save the file to the local directory
    log.info('Write bibliography to %s%s.bib', biboutputpath, methodname)
    generate_fbs_bibliography(methodname)
def check_allocation_ratios(flow_alloc_df, activity_set, source_name, method_name):
    """
    Check for issues with the flow allocation ratios
    :param flow_alloc_df: df, includes 'Sector' and 'FlowAmountRatio' columns
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param method_name: str, method name
    :return: None, saves csv of allocation ratios to output folder
    """
    # create column of sector lengths
    flow_alloc_df.loc[:, 'slength'] = flow_alloc_df['Sector'].apply(lambda x: len(x))
    # subset df
    flow_alloc_df2 = flow_alloc_df[['FBA_Activity', 'Location', 'slength', 'FlowAmountRatio']]
    # sum the flow amount ratios by location and sector length
    flow_alloc_df3 = flow_alloc_df2.groupby(
        ['FBA_Activity', 'Location', 'slength'], as_index=False)[["FlowAmountRatio"]].agg("sum")
    # not interested in sector length > 6
    flow_alloc_df4 = flow_alloc_df3[flow_alloc_df3['slength'] <= 6]

    ua_count1 = len(flow_alloc_df4[flow_alloc_df4['FlowAmountRatio'] < 1])
    log.info('There are ' + str(ua_count1) + ' instances at a sector length of 6 or less '
             'where the allocation ratio for a location and sector length is < 1')
    ua_count2 = len(flow_alloc_df4[flow_alloc_df4['FlowAmountRatio'] < 0.99])
    log.info('There are ' + str(ua_count2) + ' instances at a sector length of 6 or less '
             'where the allocation ratio for a location and sector length is < 0.99')
    ua_count3 = len(flow_alloc_df4[flow_alloc_df4['FlowAmountRatio'] > 1])
    log.info('There are ' + str(ua_count3) + ' instances at a sector length of 6 or less '
             'where the allocation ratio for a location and sector length is > 1')
    ua_count4 = len(flow_alloc_df4[flow_alloc_df4['FlowAmountRatio'] > 1.01])
    log.info('There are ' + str(ua_count4) + ' instances at a sector length of 6 or less '
             'where the allocation ratio for a location and sector length is > 1.01')

    # save csv to output folder
    log.info('Save the summary table of flow allocation ratios for each sector length for ' +
             activity_set + ' in output folder')
    # output data for all sector lengths
    flow_alloc_df3.to_csv(outputpath + "FlowBySectorMethodAnalysis/" + method_name + '_' +
                          source_name + "_allocation_ratios_" + activity_set + ".csv", index=False)

    return None
def check_if_location_systems_match(df1, df2):
    """
    Check if two dataframes share the same location system
    :param df1: fba or fbs df
    :param df2: fba or fbs df
    :return:
    """
    if df1["LocationSystem"].all() == df2["LocationSystem"].all():
        log.info("LocationSystems match")
    else:
        log.warning("LocationSystems do not match, might lose county level data")
def reassign_airplane_emissions(df, year, NAICS_level_value):
    """
    Reassigns emissions from airplanes to the NAICS associated with air
    transportation instead of the NAICS assigned to airports
    :param df: a dataframe of emissions and mapped facilities from stewicombo
    :param year: year as str
    :param NAICS_level_value: desired NAICS aggregation level, using sector_level_key,
        should match target_sector_level
    :return: df
    """
    import stewi
    from stewicombo.overlaphandler import remove_default_flow_overlaps
    from stewicombo.globals import addChemicalMatches

    # subtract emissions for air transportation from airports in NEI
    airport_NAICS = '4881'
    air_transportation_SCC = '2275020000'
    air_transportation_naics = '481111'
    log.info('Reassigning emissions from air transportation from airports')

    # obtain and prepare SCC dataset
    df_airplanes = stewi.getInventory('NEI', year, stewiformat='flowbyprocess')
    df_airplanes = df_airplanes[df_airplanes['Process'] == air_transportation_SCC]
    df_airplanes['Source'] = 'NEI'
    df_airplanes = addChemicalMatches(df_airplanes)
    df_airplanes = remove_default_flow_overlaps(df_airplanes, SCC=True)
    df_airplanes.drop(columns=['Process'], inplace=True)

    facility_mapping_air = df[['FacilityID', 'NAICS']]
    facility_mapping_air.drop_duplicates(keep='first', inplace=True)
    df_airplanes = df_airplanes.merge(facility_mapping_air, how='left', on='FacilityID')

    df_airplanes['Year'] = year
    df_airplanes = df_airplanes[
        df_airplanes['NAICS'].str[0:len(airport_NAICS)] == airport_NAICS]

    # subtract airplane emissions from airport NAICS at individual facilities
    df_planeemissions = df_airplanes[['FacilityID', 'FlowName', 'FlowAmount']]
    df_planeemissions.rename(columns={'FlowAmount': 'PlaneEmissions'}, inplace=True)
    df = df.merge(df_planeemissions, how='left', on=['FacilityID', 'FlowName'])
    df[['PlaneEmissions']] = df[['PlaneEmissions']].fillna(value=0)
    df['FlowAmount'] = df['FlowAmount'] - df['PlaneEmissions']
    df.drop(columns=['PlaneEmissions'], inplace=True)

    # add airplane emissions under air transport NAICS
    df_airplanes.loc[:, 'NAICS_lvl'] = air_transportation_naics[0:NAICS_level_value]
    df = pd.concat([df, df_airplanes], ignore_index=True)

    return df
def harmonize_FBS_columns(df):
    """
    For FBS use in USEEIOR, harmonize the values in the columns
    - LocationSystem: drop the year, so just 'FIPS'
    - MeasureofSpread: tmp set to NoneType as values currently misleading
    - Spread: tmp set to 0 as values currently misleading
    - DistributionType: tmp set to NoneType as values currently misleading
    - MetaSources: Combine strings for rows where class/context/flowtype/flowable/etc. are equal
    :param df: FBS dataframe with mixed values/strings in columns
    :return: FBS df with harmonized values/strings in columns
    """
    # harmonize LocationSystem column
    log.info('Drop year in LocationSystem')
    if df['LocationSystem'].str.contains('FIPS').all():
        df = df.assign(LocationSystem='FIPS')
    # harmonize MeasureofSpread
    log.info('Reset MeasureofSpread to NoneType')
    df = df.assign(MeasureofSpread=None)
    # reset spread, as current values are misleading
    log.info('Reset Spread to 0')
    df = df.assign(Spread=0)
    # harmonize DistributionType
    log.info('Reset DistributionType to NoneType')
    df = df.assign(DistributionType=None)

    # harmonize metasources
    log.info('Harmonize MetaSources')
    df = replace_NoneType_with_empty_cells(df)

    # subset all string cols of the df and drop duplicates
    string_cols = ['Flowable', 'Class', 'SectorProducedBy', 'SectorConsumedBy',
                   'SectorSourceName', 'Context', 'Location', 'LocationSystem',
                   'Unit', 'FlowType', 'Year', 'MeasureofSpread', 'MetaSources']
    df_sub = df[string_cols].drop_duplicates().reset_index(drop=True)
    # sort df
    df_sub = df_sub.sort_values(
        ['MetaSources', 'SectorProducedBy', 'SectorConsumedBy']).reset_index(drop=True)

    # new group cols
    group_no_meta = [e for e in string_cols if e not in 'MetaSources']

    # combine/sum columns that share the same data other than MetaSources,
    # combining MetaSources string in process
    df_sub = df_sub.groupby(group_no_meta)['MetaSources'].apply(', '.join).reset_index()

    # drop the MetaSources col in original df and replace with the MetaSources col in df_sub
    df = df.drop(columns='MetaSources')
    harmonized_df = df.merge(df_sub, how='left')
    harmonized_df = replace_strings_with_NoneType(harmonized_df)

    return harmonized_df
def check_allocation_ratios(flow_alloc_df_load, activity_set, config):
    """
    Check for issues with the flow allocation ratios
    :param flow_alloc_df_load: df, includes 'FlowAmountRatio' column
    :param activity_set: str, activity set
    :param config: dictionary, method yaml
    :return: print out information regarding allocation ratios,
        save csv of results to local directory
    """
    # create column of sector lengths
    flow_alloc_df = \
        flow_alloc_df_load.assign(SectorLength=flow_alloc_df_load['Sector'].str.len())
    # subset df
    flow_alloc_df2 = flow_alloc_df[['FBA_Activity', 'Location', 'SectorLength', 'FlowAmountRatio']]
    # sum the flow amount ratios by location and sector length
    flow_alloc_df3 = \
        flow_alloc_df2.groupby(['FBA_Activity', 'Location', 'SectorLength'],
                               dropna=False, as_index=False).agg({"FlowAmountRatio": sum})
    # keep only rows of specified sector length
    flow_alloc_df4 = flow_alloc_df3[
        flow_alloc_df3['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)
    # keep data where the flowamountratio differs from 1 by more than the tolerance
    tolerance = 0.01
    flow_alloc_df5 = flow_alloc_df4[
        (flow_alloc_df4['FlowAmountRatio'] < 1 - tolerance) |
        (flow_alloc_df4['FlowAmountRatio'] > 1 + tolerance)]

    if len(flow_alloc_df5) > 0:
        vLog.info('There are %s instances at a sector length of %s '
                  'where the allocation ratio for a location is greater '
                  'than or less than 1 by at least %s. See Validation Log',
                  len(flow_alloc_df5), config["target_sector_level"], str(tolerance))

    # add to validation log
    log.info('Save the summary table of flow allocation ratios for each sector length for '
             '%s in validation log', activity_set)
    # if df not empty, print, if empty, print string
    if flow_alloc_df5.empty:
        vLogDetailed.info('Flow allocation ratios for %s all round to 1', activity_set)
    else:
        vLogDetailed.info('Flow allocation ratios for %s: '
                          '\n {}'.format(flow_alloc_df5.to_string()), activity_set)
def function_allocation_method(flow_subset_mapped, k, names, attr, fbs_list):
    """
    Allocate df activities to sectors using a function identified in the FBS method yaml
    :param flow_subset_mapped: df, FBA with flows converted using fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, fbs dfs created running flowbysector.py
    :return: df, FBS, with allocated activity columns to sectors
    """
    log.info('Calling on function specified in method yaml to allocate '
             '%s to sectors', ', '.join(map(str, names)))
    fbs = dynamically_import_fxn(k, attr['allocation_source'])(flow_subset_mapped, attr, fbs_list)
    return fbs
def compare_df_units(df1_load, df2_load):
    """
    Determine what units are in each df prior to merge
    :param df1_load: df, first df to compare
    :param df2_load: df, second df to compare
    :return: None, logs an info message if units differ
    """
    df1 = df1_load['Unit'].drop_duplicates().tolist()
    df2 = df2_load['Unit'].drop_duplicates().tolist()

    # identify differences between unit lists
    list_comp = list(set(df1) ^ set(df2))
    # if list is not empty, print warning that units are different
    if list_comp:
        log.info('Merging df with %s and df with %s units', df1, df2)
def obtain_NAICS_from_facility_matcher(inventory_list):
    """
    Access NAICS codes from facilitymatcher and assign based on FRS_ID
    :param inventory_list: list of inventories of interest (e.g., ['NEI'])
    :return: df of primary NAICS codes by FRS_ID and Source
    """
    import facilitymatcher
    # Access NAICS from facility matcher and assign based on FRS_ID
    all_NAICS = facilitymatcher.get_FRS_NAICSInfo_for_facility_list(
        frs_id_list=None, inventories_of_interest_list=inventory_list)
    all_NAICS = all_NAICS.loc[all_NAICS['PRIMARY_INDICATOR'] == 'PRIMARY']
    all_NAICS.drop(columns=['PRIMARY_INDICATOR'], inplace=True)
    all_NAICS = naics_expansion(all_NAICS)
    if len(all_NAICS[all_NAICS.duplicated(subset=['FRS_ID', 'Source'], keep=False)]) > 0:
        log.info('Duplicate primary NAICS reported - keeping first')
        all_NAICS.drop_duplicates(subset=['FRS_ID', 'Source'], keep='first', inplace=True)
    return all_NAICS
def acup_call(*, resp, **_):
    """
    Convert response for calling url to pandas dataframe, begin parsing df into FBA format
    :param resp: response from url call
    :return: pandas dataframe of original source data
    """
    response_json = json.loads(resp.text)
    # not all states have data, so return empty df if data do not exist
    try:
        df = pd.DataFrame(data=response_json["data"])
    except KeyError:
        log.info('No data exists for state')
        df = []
    return df
def main(**kwargs):
    """
    Generate FBA parquet(s)
    :param kwargs: 'source' and 'year'
    :return: parquet saved to local directory
    """
    # assign arguments
    if len(kwargs) == 0:
        kwargs = parse_args()

    # assign yaml parameters (common.py fxn)
    config = load_sourceconfig(kwargs['source'])

    log.info("Creating dataframe list")
    # a range of years can be defined, to support a split into multiple parquets
    if '-' in str(kwargs['year']):
        years = str(kwargs['year']).split('-')
        min_year = int(years[0])
        max_year = int(years[1]) + 1
        year_iter = list(range(min_year, max_year))
    else:
        # else only a single year defined, create an array of one
        year_iter = [kwargs['year']]

    for p_year in year_iter:
        kwargs['year'] = str(p_year)
        # build the base url with strings that will be replaced
        build_url = build_url_for_query(config, kwargs)
        # replace parts of urls with specific instructions from source.py
        urls = assemble_urls_for_query(build_url, config, kwargs)
        # create a list with data from all source urls
        dataframe_list = call_urls(urls, kwargs, config)
        # concat the dataframes and parse data with specific instructions from source.py
        log.info("Concat dataframe list and parse data")
        df = parse_data(dataframe_list, kwargs, config)
        if isinstance(df, list):
            for frame in df:
                if not len(frame.index) == 0:
                    try:
                        source_names = frame['SourceName']
                        source_name = source_names.iloc[0]
                    except KeyError:
                        source_name = kwargs['source']
                    process_data_frame(frame, source_name, kwargs['year'], config)
        else:
            process_data_frame(df, kwargs['source'], kwargs['year'], config)
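# Illustrative usage (a sketch): main() can be called directly rather than from the
# command line; the source code shown is an assumption and must match a source yaml
# available to flowsa. A hyphenated year range generates one parquet per year.
# main(source='USDA_CoA_Cropland', year='2017')
# main(source='USDA_CoA_Cropland', year='2012-2017')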
def replace_naics_w_naics_2012(df, sectorsourcename):
    """
    Check if sector-like values are NAICS 2012 codes; if they represent a different
    NAICS year, replace them with their NAICS 2012 equivalents.
    Also works for the Sector column.
    :param df: df with sector columns
    :param sectorsourcename: str, sector source name
    :return: df with sectors replaced with NAICS 2012 codes
    """
    # drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # load the master crosswalk and subset by sectorsourcename, save values to list
    cw_load = load_sector_crosswalk()
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()
    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    possible_column_headers = ['Sector', 'SectorProducedBy', 'SectorConsumedBy']
    # list of column headers that do exist in the df being aggregated
    column_headers = [e for e in possible_column_headers if e in df.columns.values.tolist()]

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics2012 = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is not in crosswalk list
    if len(non_naics2012) != 0:
        log.info('Checking if sectors represent a different NAICS year, '
                 'if so, replace with NAICS 2012')
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the 'NAICS_2012_Code' column,
            # use that value to replace sector in column c
            df.loc[df[c] == df['NAICS'], c] = df['NAICS_2012_Code']
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df['NAICS_2012_Code'],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(columns=['NAICS_2012_Code', 'NAICS', 'allocation_ratio'])
        log.info('Replaced NAICS with NAICS 2012 Codes')

        # check if there are any sectors that are not in the naics 2012 crosswalk
        log.info('Check again for non NAICS 2012 Codes')
        check_if_sectors_are_naics(df, cw, column_headers)
    else:
        log.info('No sectors require substitution')

    return df
def getFlowByActivity(flowclass, years, datasource, geographic_level='all'):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow. required. E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years [2015], or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :param geographic_level: default set to 'all', which will load all geographic scales in the
        FlowByActivity, can specify 'national', 'state', 'county'
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        # first try reading parquet from your local repo
        try:
            log.info('Loading ' + datasource + ' ' + str(y) + ' parquet from local repository')
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" + str(y) + ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except (OSError, FileNotFoundError):
            # if parquet does not exist in local repo, read file from Data Commons
            try:
                log.info(datasource + ' parquet not found in local repo, loading from Data Commons')
                fba = pd.read_parquet(
                    'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' +
                    datasource + "_" + str(y) + '.parquet')
                fba = fba[fba['Class'].isin(flowclass)]
                fbas = pd.concat([fbas, fba], sort=False)
            except FileNotFoundError:
                log.error("No parquet file found for datasource " + datasource +
                          " and year " + str(y) + " in flowsa or Data Commons")
    # if geographic level specified, only load rows in geo level
    if geographic_level != 'all':
        fbas = filter_by_geoscale(fbas, geographic_level)
    return fbas
def generate_list_of_sources_in_fbs_method(methodname):
    """
    Determine what FlowByActivities are used to generate a FlowBySector
    :param methodname: string, FlowBySector method
    :return: list, pairs of FlowByActivity source names and years
    """
    sources = []
    # load the fbs method yaml
    fbs_yaml = load_method(methodname)

    # create list of data and allocation data sets
    fbs = fbs_yaml['source_names']
    for fbs_k, fbs_v in fbs.items():
        try:
            sources.append([fbs_k, fbs_v['year']])
        except KeyError:
            log.info('Could not append %s to datasource list', fbs_k)
            continue
        activities = fbs_v['activity_sets']
        for aset, attr in activities.items():
            if attr['allocation_source'] != 'None':
                sources.append([attr['allocation_source'], attr['allocation_source_year']])
            if 'helper_source' in attr:
                sources.append([attr['helper_source'], attr['helper_source_year']])
            if 'literature_sources' in attr:
                for source, date in attr['literature_sources'].items():
                    sources.append([source, date])
    # load any additional fbas that are called in a fbs method within fxns
    try:
        fbas = load_fbs_methods_additional_fbas_config()[methodname]
        for s, acts_info in fbas.items():
            for acts, fxn_info in acts_info.items():
                for fxn, fba_info in fxn_info.items():
                    for fba, y in fba_info.items():
                        fxn_config = load_functions_loading_fbas_config()[fxn][fba]
                        sources.append([fxn_config['source'], y])
    except KeyError:
        pass

    return sources
def check_if_data_exists_for_same_geoscales(fba_wsec_walloc, source, activity):
    """
    Determine if data exists at the same scales for datasource and allocation source
    :param fba_wsec_walloc: fba df with sectors and merged allocation table
    :param source: str, source name
    :param activity: list, activity names
    :return: None, logs whether allocation flow ratio data are missing
    """
    # todo: modify so only returns warning if no value for entire location,
    #  not just no value for one of the possible sectors

    from flowsa.mapping import get_activitytosector_mapping

    # create list of highest sector level for which there should be data
    mapping = get_activitytosector_mapping(source)
    # filter by activity of interest
    mapping = mapping.loc[mapping['Activity'].isin(activity)]
    # add sectors to list
    sectors_list = pd.unique(mapping['Sector']).tolist()

    # subset fba w sectors and with merged allocation table so only have
    # rows with aggregated sector list
    df_subset = fba_wsec_walloc.loc[
        (fba_wsec_walloc[fbs_activity_fields[0]].isin(sectors_list)) |
        (fba_wsec_walloc[fbs_activity_fields[1]].isin(sectors_list))].reset_index(drop=True)
    # only interested in total flows
    # df_subset = df_subset.loc[df_subset['FlowName'] == 'total'].reset_index(drop=True)
    # df_subset = df_subset.loc[df_subset['Compartment'] == 'total'].reset_index(drop=True)

    # create subset of fba where the allocation data is missing
    missing_alloc = df_subset.loc[df_subset['FlowAmountRatio'].isna()].reset_index(drop=True)
    # drop any rows where source flow value = 0
    missing_alloc = missing_alloc.loc[missing_alloc['FlowAmount'] != 0].reset_index(drop=True)
    # create list of locations with missing allocation data
    states_missing_data = pd.unique(missing_alloc['Location']).tolist()

    if len(missing_alloc) == 0:
        log.info("All aggregated sector flows have allocation flow ratio data")
    else:
        log.warning("Missing allocation flow ratio data for " + ', '.join(states_missing_data))

    return None
def load_file(datafile, local_file, remote_file):
    """
    Loads a preprocessed file
    :param datafile: a data file name with any preceding relative file path
    :param local_file: path to the file in the local directory
    :param remote_file: path to the file on the remote server
    :return: a pandas dataframe of the datafile
    """
    if os.path.exists(local_file):
        log.info('Loading ' + datafile + ' from local repository')
        df = pd.read_parquet(local_file)
    else:
        try:
            log.info(datafile + ' not found in local folder; loading from remote server...')
            df = pd.read_parquet(remote_file)
        except FileNotFoundError:
            log.error("No file found for " + datafile)
            df = None
    return df
def getFlowByActivity(datasource, year, flowclass=None, geographic_level=None,
                      download_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Retrieves stored data in the FlowByActivity format
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data. Optional.
        E.g. 'national', 'state', 'county'.
    :param download_if_missing: bool, if True will attempt to load from remote server
        prior to generating if file not found locally
    :return: a pandas DataFrame in FlowByActivity format
    """
    from esupy.processed_data_mgmt import download_from_remote
    # Set fba metadata
    name = flowsa.flowbyactivity.set_fba_name(datasource, year)
    fba_meta = set_fb_meta(name, "FlowByActivity")

    # Try to load a local version of fba; generate and load if missing
    fba = load_preprocessed_output(fba_meta, paths)
    # Remote download
    if fba is None and download_if_missing:
        log.info('%s %s not found in %s, downloading from remote source',
                 datasource, str(year), fbaoutputpath)
        download_from_remote(fba_meta, paths)
        fba = load_preprocessed_output(fba_meta, paths)

    if fba is None:
        log.info('%s %s not found in %s, running functions to generate FBA',
                 datasource, str(year), fbaoutputpath)
        # Generate the fba
        flowsa.flowbyactivity.main(year=year, source=datasource)
        # Now load the fba
        fba = load_preprocessed_output(fba_meta, paths)
        if fba is None:
            log.error('getFlowByActivity failed, FBA not found')
        else:
            log.info('Loaded %s %s from %s', datasource, str(year), fbaoutputpath)
    else:
        log.info('Loaded %s %s from %s', datasource, str(year), fbaoutputpath)

    # Address optional parameters
    if flowclass is not None:
        fba = fba[fba['Class'] == flowclass]
    # if geographic level specified, only load rows in geo level
    if geographic_level is not None:
        fba = filter_by_geoscale(fba, geographic_level)
    return fba
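# Illustrative usage of the single-year signature above (a sketch): the datasource
# code and year are assumptions; if the parquet is not found locally it is downloaded
# or generated depending on download_if_missing.
# fba = getFlowByActivity(datasource='EIA_MECS_Energy', year=2014,
#                         flowclass='Energy', geographic_level='national',
#                         download_if_missing=True)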
def getFlowBySector(methodname, file_location='local'):
    """
    Retrieves stored data in the FlowBySector format
    :param methodname: string, Name of an available method for the given class
    :param file_location: 'local' or 'remote'. Default is 'local'
    :return: dataframe in flow by sector format
    """
    # define fbs file
    fbs_file = methodname + ".parquet"
    local_file_path = fbsoutputpath + fbs_file
    remote_file_path = 'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowBySector/' + fbs_file

    if file_location == 'local':
        fbs = load_file(fbs_file, local_file_path, remote_file_path)
    else:
        log.info('Loading ' + methodname + ' from remote server')
        fbs = pd.read_parquet(remote_file_path)
    return fbs
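# Illustrative usage (a sketch): the FlowBySector method name is an assumption and
# must match an available FBS parquet, either locally or on the remote server.
# fbs_local = getFlowBySector('Water_national_2015_m1')
# fbs_remote = getFlowBySector('Water_national_2015_m1', file_location='remote')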
def subset_df_by_geoscale(df, activity_from_scale, activity_to_scale):
    """
    Subset a df by geoscale or aggregate to create data specified in method yaml
    :param df: df, flowbyactivity format
    :param activity_from_scale: str, geoscale to subset from
    :param activity_to_scale: str, geoscale to subset or aggregate to
    :return: df, subset or aggregated to the target geoscale
    """
    # determine 'activity_from_scale' for use in df geoscale subset, by activity
    modified_from_scale = return_activity_from_scale(df, activity_from_scale)
    # add 'activity_from_scale' column to df
    df2 = pd.merge(df, modified_from_scale)

    # list of unique 'from' geoscales
    unique_geoscales = modified_from_scale['activity_from_scale'].drop_duplicates().values.tolist()

    # to scale
    if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
        to_scale = activity_to_scale
    else:
        to_scale = activity_from_scale

    df_subset_list = []
    # subset df based on activity 'from' scale
    for i in unique_geoscales:
        df3 = df2[df2['activity_from_scale'] == i]
        # if desired geoscale doesn't exist, aggregate existing data
        # if df is less aggregated than allocation df, aggregate fba activity to allocation geoscale
        if fips_number_key[i] > fips_number_key[to_scale]:
            log.info("Aggregating subset from " + i + " to " + to_scale)
            df_sub = agg_by_geoscale(df3, i, to_scale, fba_default_grouping_fields)
        # else filter relevant rows
        else:
            log.info("Subsetting " + i + " data")
            df_sub = filter_by_geoscale(df3, i)
        df_subset_list.append(df_sub)
    df_subset = pd.concat(df_subset_list)

    return df_subset
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or FlowBySector parquet
    stored in flowsa, or a FlowBySector formatted dataframe from another package.
    :param k: The datasource name
    :param v: The datasource parameters
    :return: df of the datasource
    """
    if v['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter dataframe
        if 'source_fba_load_scale' in v:
            geo_level = v['source_fba_load_scale']
        else:
            geo_level = 'all'
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year']))
        flows_df = flowsa.getFlowByActivity(flowclass=[v['class']], years=[v['year']],
                                            datasource=k, geographic_level=geo_level)
    elif v['data_format'] == 'FBS':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = getattr(sys.modules[__name__], v["FBS_datapull_fxn"])(*v['parameters'])
    else:
        log.error("Data format not specified in method file for datasource " + k)

    return flows_df
def process_data_frame(*, df, source, year, config):
    """
    Process the given dataframe, cleaning, converting data, and
    writing the final parquet. This method was written to move code into a shared method,
    which was necessary to support the processing of a list of dataframes instead of a
    single dataframe.
    :param df: df, FBA format
    :param source: str, source name
    :param year: str, year
    :param config: dict, items in method yaml
    :return: df, FBA format, standardized
    """
    # log that data was retrieved
    log.info("Retrieved data for %s %s", source, year)
    # add any missing columns of data and cast to appropriate data type
    log.info("Add any missing columns and check field datatypes")
    flow_df = clean_df(df, flow_by_activity_fields, fba_fill_na_dict, drop_description=False)
    # sort df and reset index
    flow_df = flow_df.sort_values(['Class', 'Location', 'ActivityProducedBy',
                                   'ActivityConsumedBy', 'FlowName',
                                   'Compartment']).reset_index(drop=True)
    # save as parquet file
    name_data = set_fba_name(source, year)
    meta = set_fb_meta(name_data, "FlowByActivity")
    write_df_to_file(flow_df, paths, meta)
    write_metadata(source, config, meta, "FlowByActivity", year=year)
    log.info("FBA generated and saved for %s", name_data)
    # rename the log file saved to local directory
    rename_log_file(name_data, meta)