def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or FlowBySector
    parquet stored in flowsa, or a FlowBySector formatted dataframe from
    another package.
    :param k: str, the datasource name
    :param v: dict, the datasource parameters
    :return: df of the loaded source data, or None when the data format in
        the method file is unrecognized
    """
    # initialize to None so an unrecognized data_format logs an error
    # instead of raising NameError on the return statement below
    flows_df = None
    if v['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter dataframe
        geo_level = v.get('source_fba_load_scale', 'all')
        log.info("Retrieving flowbyactivity for datasource " + k +
                 " in year " + str(v['year']))
        flows_df = flowsa.getFlowByActivity(flowclass=[v['class']],
                                            years=[v['year']],
                                            datasource=k,
                                            geographic_level=geo_level)
    elif v['data_format'] == 'FBS':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource " + k)
        # call the datapull function named in the method yaml, unpacking
        # the yaml-provided parameter list as positional arguments
        flows_df = getattr(sys.modules[__name__],
                           v["FBS_datapull_fxn"])(*v['parameters'])
    else:
        log.error("Data format not specified in method file for datasource " + k)
    return flows_df
def store_flowbysector(fbs_df, parquet_name):
    """
    Save a FlowBySector dataframe as a parquet file.
    :param fbs_df: df, FlowBySector format
    :param parquet_name: str, output file name without the extension
    :return: None
    """
    f = fbsoutputpath + parquet_name + '.parquet'
    try:
        fbs_df.to_parquet(f)
    except Exception:
        # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; the save remains best-effort and only logs
        log.error('Failed to save ' + parquet_name + ' file.')
def getFlowBySector(methodname):
    """
    Retrieves stored data in the FlowBySector format
    :param methodname: string, Name of an available method for the given class
    :return: dataframe in flow by sector format
    """
    result = pd.DataFrame()
    local_path = fbsoutputpath + methodname + ".parquet"
    remote_path = ('https://edap-ord-data-commons.s3.amazonaws.com/'
                   'flowsa/FlowBySector/' + methodname + ".parquet")
    try:
        # the local repository copy takes precedence when it exists
        log.info('Loading ' + methodname + ' parquet from local repository')
        result = pd.read_parquet(local_path)
    except (OSError, FileNotFoundError):
        # fall back to the copy hosted on Data Commons
        try:
            log.info(methodname +
                     ' parquet not found in local repo, loading from Data Commons')
            result = pd.read_parquet(remote_path)
        except FileNotFoundError:
            log.error("No parquet file found for datasource " + methodname +
                      " in flowsa or Data Commons")
    return result
def collapse_activity_fields(df):
    """
    The 'activityconsumedby' and 'activityproducedby' columns from the
    allocation dataset do not always align with the dataframe being
    allocated. Generalize the allocation activity column.
    :param df: df, FBA used to allocate another FBA
    :return: df, single Activity column
    """
    df = replace_strings_with_NoneType(df)

    def _column_is_all_none(col):
        # True when every distinct value in the column is None
        return all(x is None for x in df[col].drop_duplicates().values.tolist())

    # whichever activity/sector pair is entirely None is dropped, and the
    # remaining pair is renamed to the generic Activity/Sector columns
    if _column_is_all_none('ActivityConsumedBy'):
        df = df.drop(columns=['ActivityConsumedBy', 'SectorConsumedBy'])
        df = df.rename(columns={'ActivityProducedBy': 'Activity',
                                'SectorProducedBy': 'Sector'})
    elif _column_is_all_none('ActivityProducedBy'):
        df = df.drop(columns=['ActivityProducedBy', 'SectorProducedBy'])
        df = df.rename(columns={'ActivityConsumedBy': 'Activity',
                                'SectorConsumedBy': 'Sector'})
    else:
        log.error('Cannot generalize dataframe')
    # drop other columns
    df = df.drop(columns=['ProducedBySectorType', 'ConsumedBySectorType'])
    return df
def getFlowByActivity(flowclass, years, datasource):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow. required.
        E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years [2015], or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        # first try reading parquet from your local repo
        try:
            log.info('Loading ' + datasource + ' ' + str(y) +
                     ' parquet from local repository')
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" +
                                  str(y) + ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except (OSError, FileNotFoundError):
            # if parquet does not exist in local repo, read file from Data Commons
            try:
                log.info(datasource +
                         ' parquet not found in local repo, loading from Data Commons')
                fba = pd.read_parquet(
                    'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' +
                    datasource + "_" + str(y) + '.parquet')
                fba = fba[fba['Class'].isin(flowclass)]
                fbas = pd.concat([fbas, fba], sort=False)
            except FileNotFoundError:
                # fixed: the message previously read "...datasourceand year..."
                # because of a missing space before 'and year'
                log.error("No parquet file found for datasource " + datasource +
                          " and year " + str(y) + " in flowsa or Data Commons")
    return fbas
def getFlowByActivity(datasource, year, flowclass=None, geographic_level=None,
                      download_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Retrieves stored data in the FlowByActivity format
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data.
        Optional. E.g. 'national', 'state', 'county'.
    :param download_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: a pandas DataFrame in FlowByActivity format
    """
    from esupy.processed_data_mgmt import download_from_remote

    # Set fba metadata
    name = flowsa.flowbyactivity.set_fba_name(datasource, year)
    fba_meta = set_fb_meta(name, "FlowByActivity")

    # Attempt 1: load a pre-existing local copy
    fba = load_preprocessed_output(fba_meta, paths)

    # Attempt 2: optionally pull the file from the remote server
    if fba is None and download_if_missing:
        log.info('%s %s not found in %s, downloading from remote source',
                 datasource, str(year), fbaoutputpath)
        download_from_remote(fba_meta, paths)
        fba = load_preprocessed_output(fba_meta, paths)

    if fba is not None:
        log.info('Loaded %s %s from %s', datasource, str(year), fbaoutputpath)
    else:
        # Attempt 3: generate the FBA locally, then load it
        log.info('%s %s not found in %s, running functions to generate FBA',
                 datasource, str(year), fbaoutputpath)
        flowsa.flowbyactivity.main(year=year, source=datasource)
        fba = load_preprocessed_output(fba_meta, paths)
        if fba is None:
            log.error('getFlowByActivity failed, FBA not found')
        else:
            log.info('Loaded %s %s from %s',
                     datasource, str(year), fbaoutputpath)

    # Address optional parameters
    if flowclass is not None:
        fba = fba[fba['Class'] == flowclass]
    # if geographic level specified, only load rows in geo level
    if geographic_level is not None:
        fba = filter_by_geoscale(fba, geographic_level)
    return fba
def load_method(method_name):
    """
    Loads a flowbysector method from a YAML
    :param method_name: str, FBS method name (ex. 'Water_national_m1_2015')
    :return: dictionary, items in the FBS method yaml, or None when the
        method file is not found
    """
    # initialize to None so a missing file logs an error instead of
    # raising NameError on the return statement below
    method = None
    sfile = flowbysectormethodpath + method_name + '.yaml'
    try:
        with open(sfile, 'r') as f:
            method = yaml.safe_load(f)
    except IOError:
        log.error("FlowBySector method file not found.")
    return method
def getFlowBySector(methodname):
    """
    Retrieves stored data in the FlowBySector format
    :param methodname: string, Name of an available method for the given class
    :return: dataframe in flow by sector format
    """
    parquet_file = fbsoutputpath + methodname + ".parquet"
    fbs = pd.DataFrame()
    try:
        fbs = pd.read_parquet(parquet_file)
    except FileNotFoundError:
        # no stored output: the empty frame initialized above is returned
        log.error("No parquet file found for datasource " + methodname +
                  " in flowsa")
    return fbs
def load_method(method_name):
    """
    Loads a flowbysector method from a YAML
    :param method_name: str, name of the FBS method yaml file to load
    :return: dictionary of the items in the FBS method yaml, or None when
        the method file is not found
    """
    # initialize to None so a missing file logs an error instead of
    # raising NameError on the return statement below
    method = None
    sfile = flowbysectormethodpath + method_name + '.yaml'
    try:
        with open(sfile, 'r') as f:
            method = yaml.safe_load(f)
    except IOError:
        log.error("FlowBySector method file not found.")
    return method
def filter_by_geoscale(df, geoscale):
    """
    Filter flowbyactivity by FIPS at the given scale
    :param df: Either flowbyactivity or flowbysector
    :param geoscale: string, either 'national', 'state', or 'county'
    :return: filtered flowbyactivity or flowbysector, or None when no rows
        exist at the requested scale
    """
    fips = create_geoscale_list(df, geoscale)
    df = df[df['Location'].isin(fips)]
    if len(df) == 0:
        # fixed: the message previously rendered with a stray double space
        # ("in the  flow dataset") from concatenating "the " + " flow"
        log.error("No flows found in the flow dataset at the " +
                  geoscale + " scale.")
        # made the previously implicit None return explicit
        return None
    return df
def getFlowByActivity(flowclass, years, datasource, geographic_level='all'):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow. required.
        E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years [2015], or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :param geographic_level: default set to 'all', which will load all
        geographic scales in the FlowByActivity, can specify 'national',
        'state', 'county'
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        # first try reading parquet from your local repo
        try:
            log.info('Loading ' + datasource + ' ' + str(y) +
                     ' parquet from local repository')
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" +
                                  str(y) + ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except (OSError, FileNotFoundError):
            # if parquet does not exist in local repo, read file from Data Commons
            try:
                log.info(datasource +
                         ' parquet not found in local repo, loading from Data Commons')
                fba = pd.read_parquet(
                    'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' +
                    datasource + "_" + str(y) + '.parquet')
                fba = fba[fba['Class'].isin(flowclass)]
                fbas = pd.concat([fbas, fba], sort=False)
            except FileNotFoundError:
                # fixed: the message previously read "...datasourceand year..."
                # because of a missing space before 'and year'
                log.error("No parquet file found for datasource " + datasource +
                          " and year " + str(y) + " in flowsa or Data Commons")
    # if geographic level specified, only load rows in geo level
    if geographic_level != 'all':
        fbas = filter_by_geoscale(fbas, geographic_level)
    return fbas
def load_file(datafile, local_file, remote_file):
    """
    Loads a preprocessed file, preferring the local copy over the remote one.
    :param datafile: str, a data file name with any preceding relative file path
    :param local_file: str, path to the local copy of the file
    :param remote_file: str, path/URL of the remote copy of the file
    :return: a pandas dataframe of the datafile, or None when neither the
        local nor the remote copy can be found
    """
    # initialize to None so a failed remote load logs an error instead of
    # raising NameError on the return statement below
    df = None
    if os.path.exists(local_file):
        log.info('Loading ' + datafile + ' from local repository')
        df = pd.read_parquet(local_file)
    else:
        try:
            log.info(datafile +
                     ' not found in local folder; loading from remote server...')
            df = pd.read_parquet(remote_file)
        except FileNotFoundError:
            log.error("No file found for " + datafile)
    return df
def getFlowByActivity(flowclass, years, datasource):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow. required.
        E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years [2015], or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        try:
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" +
                                  str(y) + ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except FileNotFoundError:
            # fixed: the message previously read "...datasourceand year..."
            # because of a missing space before 'and year'
            log.error("No parquet file found for datasource " + datasource +
                      " and year " + str(y) + " in flowsa")
    return fbas
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or FlowBySector
    parquet stored in flowsa, or a FlowBySector formatted dataframe from
    another package.
    :param k: The datasource name
    :param v: The datasource parameters
    :return: df of the loaded source data, or None when the data format is
        not recognized
    """
    # initialize to None so an unrecognized data_format logs an error
    # instead of raising NameError on the return statement below
    flows_df = None
    if v['data_format'] == 'FBA':
        log.info("Retrieving flowbyactivity for datasource " + k +
                 " in year " + str(v['year']))
        flows_df = flowsa.getFlowByActivity(flowclass=[v['class']],
                                            years=[v['year']],
                                            datasource=k)
    elif v['data_format'] == 'FBS':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = getattr(sys.modules[__name__],
                           v["FBS_datapull_fxn"])(v['parameters'])
    else:
        # fixed: the previous message ("No parquet file found...") did not
        # describe this failure; an unknown data_format is the real cause
        log.error("Data format not specified in method file for datasource " + k)
    return flows_df
def getFlowBySector(methodname, download_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Loads stored FlowBySector output or generates it if it doesn't exist,
    then loads
    :param methodname: string, Name of an available method for the given class
    :param download_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: dataframe in flow by sector format
    """
    from esupy.processed_data_mgmt import download_from_remote

    fbs_meta = set_fb_meta(methodname, "FlowBySector")

    # Attempt 1: load a pre-existing local copy
    fbs = load_preprocessed_output(fbs_meta, paths)

    # Attempt 2: optionally pull the parquet (and its log) from the remote server
    if fbs is None and download_if_missing:
        log.info('%s not found in %s, downloading from remote source',
                 methodname, fbsoutputpath)
        subdirectory_dict = {'.log': 'Log'}
        download_from_remote(fbs_meta, paths,
                             subdirectory_dict=subdirectory_dict)
        fbs = load_preprocessed_output(fbs_meta, paths)

    if fbs is not None:
        log.info('Loaded %s from %s', methodname, fbsoutputpath)
    else:
        # Attempt 3: generate the FBS locally, then load it
        log.info('%s not found in %s, running functions to generate FBS',
                 methodname, fbsoutputpath)
        flowsa.flowbysector.main(method=methodname)
        fbs = load_preprocessed_output(fbs_meta, paths)
        if fbs is None:
            log.error('getFlowBySector failed, FBS not found')
        else:
            log.info('Loaded %s from %s', methodname, fbsoutputpath)
    return fbs
def blm_pls_call(**kwargs):
    """
    Convert response for calling url to pandas dataframe, begin parsing df
    into FBA format
    :param kwargs: potential arguments include:
                   url: string, url
                   response_load: df, response from url call
                   args: dictionary, arguments specified when running
                   flowbyactivity.py ('year' and 'source')
    :return: pandas dataframe of original source data
    """
    # load arguments necessary for function
    response_load = kwargs['r']
    args = kwargs['args']

    df = pd.DataFrame()
    # {header: {sub_header: [pdf page numbers]}} lookup, filled per year below
    sub_headers = {}
    # parser state flags, mutated as rows are scanned in document order:
    #   skip       - suppress rows until the sub_header "—continued" marker
    #   copy       - currently inside the target header/sub_header section
    #   next_line  - a "Total" row signalled that the section ends next row
    skip = False
    last_row_header = ""
    next_line = False
    copy = False
    # accumulators for the output columns
    location_str = []
    flow_value = []
    flow_name = []
    number_of_sub_headers = 0
    # headers whose rows must be prefixed Competitive/Noncompetitive
    # depending on which page they appear on
    duplicate_headers = [
        "Pre-Reform Act Future Interest Leases",
        "Reform Act Leases",
        "Reform Act Future Interest Leases"
    ]

    # the PDF layout differs by report year, so the header -> sub_header ->
    # page-number map is hard-coded per supported year
    if args["year"] == "2007":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [99], "Acquired Lands": [99]},
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain & Acquired Lands": [100, 109, 110]},
            "Reform Act Leases": {
                "Public Domain": [101, 110], "Acquired Lands": [101, 102]},
            "Reform Act Leases—continued": {"Acquired Lands": [111]},
            "Reform Act Future Interest Leases": {
                "Public Domain & Acquired Lands": [103],
                "Acquired Lands": [112]},
            "Competitive General Services Administration (GSA) Oil & Gas Leases": {
                "Public Domain": [103]},
            "Competitive Protective Leases": {
                "Public Domain & Acquired Lands": [103]},
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [104]},
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [104]},
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain & Acquired Lands": [104]},
            "EPAct Competitive Geothermal Leases": {
                "Public Domain & Acquired Lands": [104]},
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [106], "Acquired Lands": [106, 107]},
            "Pre-Reform Act Simultaneous Leases": {
                "Acquired Lands": [108, 109]},
            "Summary: Pre-Reform Act Simultaneous Leases": {
                "Public Domain & Acquired Lands": [109]},
            "Geothermal Leases": {"Public Domain & Acquired Lands": [112]},
            "Private Leases": {"Acquired Lands": [114]},
            "Exchange Leases": {"Public Domain": [114]},
            "Renewal Leases": {"Public Domain": [114]},
            "Class III Reinstatement Leases": {"Public Domain": [115]},
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [115]},
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation Act of 1934": {
                "Acquired Lands": [115]},
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [115]},
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [122],
                # NOTE(review): implicit string concatenation here yields
                # "...Coal LeasingAmendment..." with NO space between the two
                # literals — looks like a missing trailing space; confirm
                # against the 2007 PDF's actual sub-header text
                "Competitive Pre-Federal Coal Leasing"
                "Amendment Act (FCLAA) Leases": [122],
                "Competitive Regional Emergency/Bypass Leases": [122],
                "Competitive Regional Leases": [123],
                "Exchange Leases": [123],
                "Preference Right Leases": [123]},
            "Coal Licenses": {
                "Exploration Licenses": [124], "Licenses to Mine": [124]},
            "Logical Mining Units": {"None": [124]},
            "Combined Hydrocarbon Leases": {"None": [126]},
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [126],
                "Phosphate Fringe Acreage Noncompetitive Leases": [126],
                "Phosphate Preference Right Leases": [126]},
            "Phosphate Use Permits": {"None": [127]},
            "Sodium Leases": {
                "Sodium Competitive Leases": [127],
                "Sodium Fringe Acreage Noncompetitive Leases": [127],
                "Sodium Preference Right Leases": [127]},
            "Sodium Use Permit": {"None": [127]},
            "Potassium Leases": {
                "Potassium Competitive Leases": [128],
                "Potassium Fringe Acreage Noncompetitive Leases": [128],
                "Potassium Preference Right Leases": [128]},
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [128],
                "Gilsonite Fringe Acreage Noncompetitive Lease": [129],
                "Gilsonite Preference Right Leases": [129]},
            "Oil Shale Leases": {"Oil Shale R, D&D Leases": [129]},
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [130]},
            "Asphalt Competitive Leases": {"None": [130]}
        }
        competitive_page_numbers = [100, 101, 102]
        no_header_page_numbers = [123, 129]
    elif args["year"] == "2011":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [111], "Acquired Lands": [111, 112]},
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain and Acquired Lands": [113, 122]},
            "Reform Act Leases": {
                "Public Domain": [113, 123], "Acquired Lands": [123, 124]},
            "Reform Act Leases—continued": {"Acquired Lands": [114]},
            "Competitive General Services Administration (GSA) Oil and Gas Leases": {
                "Public Domain": [116]},
            "Competitive Protective Leases": {
                "Public Domain and Acquired Lands": [116]},
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [116]},
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [116]},
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [117]},
            "EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [117]},
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [119], "Acquired Lands": [119]},
            "Pre-Reform Act Simultaneous Leases—continued": {
                "Acquired Lands": [120, 121]},
            "Summary: Pre-Reform Act Simultaneous Leases": {
                "Public Domain and Acquired Lands": [122]},
            "Reform Act Future Interest Leases": {"Acquired Lands": [125]},
            "Geothermal Leases": {"Public Domain and Acquired Lands": [125]},
            "Private Leases": {"Acquired Lands": [126]},
            "Exchange Leases": {"Public Domain": [126]},
            "Renewal Leases": {"Public Domain": [126, 127]},
            "Class III Reinstatement Leases": {"Public Domain": [127]},
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [127, 128]},
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation Act of 1934": {
                "Acquired Lands": [128]},
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [128]},
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [135],
                "Competitive Pre-Federal Coal Leasing Amendment Act (FCLAA) Leases": [135],
                "Competitive Regional Emergency/Bypass Leases": [135],
                "Competitive Regional Leases": [136],
                "Exchange Leases": [136],
                "Preference Right Leases": [136]},
            "Coal Licenses": {
                "Exploration Licenses": [137], "Licenses To Mine": [137]},
            "Logical Mining Units": {"None": [137]},
            "Combined Hydrocarbon Leases": {"None": [139]},
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [139],
                "Phosphate Fringe Acreage Noncompetitive Leases": [139],
                "Phosphate Preference Right Leases": [139]},
            "Phosphate Use Permits": {"None": [139]},
            "Sodium Leases": {
                "Sodium Competitive Leases": [140],
                "Sodium Fringe Acreage Noncompetitive Leases": [140],
                "Sodium Preference Right Leases": [140]},
            "Sodium Use Permit": {"None": [140]},
            "Potassium Leases": {
                "Potassium Competitive Leases": [141],
                "Potassium Fringe Acreage Noncompetitive Leases": [141],
                "Potassium Preference Right Leases": [141]},
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [142],
                "Gilsonite Fringe Acreage Noncompetitive Leases": [142],
                "Gilsonite Preference Right Leases": [142]},
            "Oil Shale RD&D Leases": {"None": [142]},
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [143]}
        }
        competitive_page_numbers = [113, 114]
        no_header_page_numbers = [136]
    elif args["year"] == "2012":
        sub_headers = {
            "Oil and Gas Pre-Reform Act Leases": {
                "Public Domain": [108], "Acquired Lands": [108, 109]},
            "Pre-Reform Act Future Interest Leases": {
                "Public Domain and Acquired Lands": [110, 119]},
            "Reform Act Leases": {
                "Public Domain": [110, 120], "Acquired Lands": [110]},
            "Reform Act Leases—continued": {"Acquired Lands": [111]},
            "Competitive General Services Administration (GSA) Oil and Gas Leases": {
                "Public Domain": [113]},
            "Competitive Protective Leases": {
                "Public Domain and Acquired Lands": [113]},
            "Competitive National Petroleum Reserve—Alaska Leases": {
                "Public Domain": [113]},
            "Competitive Naval Oil Shale Reserve Leases": {
                "Public Domain": [113]},
            "Pre-EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [114]},
            "EPAct Competitive Geothermal Leases": {
                "Public Domain and Acquired Lands": [114]},
            "Oil and Gas Pre-Reform Act Over-the-Counter Leases": {
                "Public Domain": [116], "Acquired Lands": [116]},
            "Pre-Reform Act Simultaneous Leases": {"Public Domain": [117]},
            "Pre-Reform Act Simultaneous Leases—continued": {
                "Public Domain": [118], "Acquired Lands": [118]},
            "Summary: Pre-Reform Act Simultaneous Leases": {
                "Public Domain and Acquired Lands": [119]},
            "Reform Act Future Interest Leases": {"Acquired Lands": [122]},
            "Geothermal Leases": {"Public Domain and Acquired Lands": [122]},
            "Private Leases": {"Acquired Lands": [124]},
            "Exchange Leases": {"Public Domain": [124]},
            "Renewal Leases": {"Public Domain": [124, 125]},
            "Class III Reinstatement Leases": {"Public Domain": [125]},
            "Oil and Gas Special Act – Rights-of-Way of 1930": {
                "Public Domain": [125, 126]},
            "Oil and Gas Special Act – Federal Farm Mortgage Corporation Act of 1934": {
                "Acquired Lands": [126]},
            "Oil and Gas Special Act – Texas Relinquishment Act of 1919": {
                "Acquired Lands": [126]},
            "Federal Coal Leases": {
                "Competitive Nonregional Lease-by-Application Leases": [133],
                "Competitive Pre-Federal Coal Leasing Amendment Act (FCLAA) Leases": [133],
                "Competitive Regional Emergency/Bypass Leases": [133],
                "Competitive Regional Leases": [134],
                "Exchange Leases": [134],
                "Preference Right Leases": [134]},
            "Coal Licenses": {
                "Exploration Licenses": [135], "Licenses To Mine": [135]},
            "Logical Mining Units": {"None": [135]},
            "Combined Hydrocarbon Leases": {"None": [137]},
            "Phosphate Leases": {
                "Phosphate Competitive Leases": [137],
                "Phosphate Fringe Acreage Noncompetitive Leases": [137],
                "Phosphate Preference Right Leases": [137]},
            "Phosphate Use Permits": {"None": [137]},
            "Sodium Leases": {
                "Sodium Competitive Leases": [138],
                "Sodium Fringe Acreage Noncompetitive Leases": [138],
                "Sodium Preference Right Leases": [138]},
            "Sodium Use Permit": {"None": [138]},
            "Potassium Leases": {
                "Potassium Competitive Leases": [139],
                "Potassium Fringe Acreage Noncompetitive Leases": [139],
                "Potassium Preference Right Leases": [139]},
            "Gilsonite Leases": {
                "Gilsonite Competitive Leases": [140],
                "Gilsonite Fringe Acreage Noncompetitive Leases": [140],
                "Gilsonite Preference Right Leases": [140]},
            "Oil Shale RD&D Leases": {"None": [140]},
            "Hardrock – Acquired Lands Leases": {
                "Hardrock Preference Right Leases": [141]}
        }
        competitive_page_numbers = [110, 111]
        no_header_page_numbers = [134]
    else:
        # provide reasoning for failure of parsing data
        log.error(
            'Missing code specifying sub-headers, add code to blm_pls_call()')

    for header in sub_headers:
        for sub_header in sub_headers[header]:
            pg = sub_headers[header][sub_header]
            pdf_pages = []
            # extract each relevant pdf page as a one- or two-column table
            for page_number in pg:
                found_header = False
                pdf_page = \
                    tabula.read_pdf(io.BytesIO(response_load.content),
                                    pages=page_number,
                                    stream=True,
                                    guess=False,
                                    )[0]
                if pdf_page.shape[1] == 1:
                    pdf_page.columns = ["one"]
                else:
                    pdf_page.columns = ["one", "two"]
                pdf_page.dropna(subset=["one"], inplace=True)
                # add col of page number
                pdf_page['page_no'] = page_number
                pdf_pages.append(pdf_page)
            # scan rows for the current header/sub_header section
            for page in pdf_pages:
                for index, row in page.iterrows():
                    # strip footnote markers (" /...") from the row text
                    if " /" in row["one"]:
                        split_header = row["one"].split(" /")
                        split_row = split_header[0].strip()
                    else:
                        split_row = row["one"]
                    # if page_number in no_header_page_numbers:
                    if row['page_no'] in no_header_page_numbers:
                        # if pages in no_header_page_numbers:
                        # pages without a printed header are treated as if
                        # the header had been seen
                        found_header = True
                    if split_row == header:
                        found_header = True
                        last_row_header = header
                    if split_row == sub_header and last_row_header == header:
                        copy = True
                    elif sub_header == "None" and last_row_header == header:
                        copy = True
                    # a data row: inside the section and not a header line
                    if copy and split_row != sub_header and \
                            split_row != header and found_header:
                        if "FISCAL" in row["one"] or row["one"].isdigit():
                            # page furniture (fiscal-year banner / bare page
                            # number): suppress until "—continued" is seen
                            skip = True
                        if not skip:
                            if sub_header == "None":
                                # NOTE(review): reassigns the loop variable;
                                # relies on each "None" sub_header mapping to
                                # a single scan of its pages
                                sub_header = ""
                            # split() returns [location, flow name, value]
                            lists = split(row, header, sub_header, next_line)
                            if header in duplicate_headers:
                                # if page_number in competitive_page_numbers:
                                if row['page_no'] in competitive_page_numbers:
                                    flow_name.append("Competitive " + lists[1])
                                else:
                                    flow_name.append("Noncompetitive " + lists[1])
                            else:
                                flow_name.append(lists[1])
                            location_str.append(lists[0])
                            flow_value.append(lists[2])
                            if next_line:
                                # previous "Total" row said the section ends
                                # after this row
                                copy = False
                                next_line = False
                                header = "Nothing"
                        if "Total" in row["one"]:
                            # rebuild the row text without numeric tokens to
                            # test whether this is a bare "Total" line
                            row_one_str = ""
                            if any(i.isdigit() for i in row["one"]):
                                # row split based on space
                                row_one_split = row["one"].split(" ")
                                for r in row_one_split:
                                    if not any(d.isdigit() for d in r):
                                        row_one_str = row_one_str + " " + r
                            else:
                                row_one_str = row["one"]
                            # NOTE(review): pdf_page here is whatever page was
                            # processed last in the extraction loop above, not
                            # necessarily the page `row` came from — confirm
                            # this is intentional
                            if pdf_page.shape[1] == 1 and row[
                                    "one"] == "Total":
                                next_line = True
                            elif row_one_str.strip() == "Total" or "Leases" \
                                    in row["one"] or "None" in row["one"]:
                                number_of_sub_headers = number_of_sub_headers + 1
                                copy = False
                                found_header = False
                                # if number_of_sub_headers >= len(sub_headers[item]):
                                #     header = "Nothing"
                            else:
                                next_line = True
                            # if "Total" in row["one"]:
                            #     copy = False
                            #     found_header = False
                    if sub_header + "—continued" in row["one"]:
                        # the section resumed on a later page: stop skipping
                        skip = False
    df["LocationStr"] = location_str
    df["ActivityConsumedBy"] = flow_name
    df["FlowAmount"] = flow_value
    return df
def get_fba_allocation_subset(fba_allocation, source, activitynames, **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity
    :param fba_allocation: df, FBA format
    :param source: str, source name
    :param activitynames: list, activity names in activity set
    :param kwargs: can be the mapping file and method of allocation
        ('flowSubsetMapped', 'allocMethod', 'activity_set_names')
    :return: df, FBA subset
    """
    # first determine if there are special cases that would modify the
    # typical method of subset; an example of a special case is when the
    # allocation method is 'proportional-flagged'
    subset_by_sector_cols = False
    subset_by_column_value = False
    if kwargs != {}:
        if 'flowSubsetMapped' in kwargs:
            # NOTE(review): fsm is only bound here but is used below when
            # subset_by_sector_cols is True — callers passing
            # allocMethod='proportional-flagged' must also pass
            # flowSubsetMapped, or a NameError results
            fsm = kwargs['flowSubsetMapped']
        if 'allocMethod' in kwargs:
            am = kwargs['allocMethod']
            if am == 'proportional-flagged':
                subset_by_sector_cols = True
        if 'activity_set_names' in kwargs:
            asn = kwargs['activity_set_names']
            if asn is not None:
                if 'allocation_subset_col' in asn:
                    subset_by_column_value = True

    # load the source catalog
    cat = load_source_catalog()
    src_info = cat[source]

    if src_info['sector-like_activities'] is False:
        # activities are not sector-like: translate activity names to
        # sectors via the source crosswalk first
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        sec_source_name = df['SectorSourceName'][0]
        df = expand_naics_list(df, sec_source_name)
        # subset source crosswalk to only contain values pertaining to
        # list of activity names
        df = df.loc[df['Activity'].isin(activitynames)]
        # turn column of sectors related to activity names into list
        sector_list = pd.unique(df['Sector']).tolist()
        # subset fba allocation table to the values in
        # the activity list, based on overlapping sectors
        if 'Sector' in fba_allocation:
            fba_allocation_subset = \
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    sector_list)].reset_index(drop=True)
        else:
            fba_allocation_subset = \
                fba_allocation.loc[
                    (fba_allocation[fbs_activity_fields[0]].isin(sector_list)) |
                    (fba_allocation[fbs_activity_fields[1]].isin(sector_list))]. \
                reset_index(drop=True)
    else:
        # activities already sector-like: match on the activity names directly
        if 'Sector' in fba_allocation:
            fba_allocation_subset = \
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    activitynames)].reset_index(drop=True)
        elif subset_by_sector_cols:
            # if it is a special case, then base the subset of data on
            # sectors in the sector columns, not on activitynames
            fsm_sub = fsm.loc[
                (fsm[fba_activity_fields[0]].isin(activitynames)) |
                (fsm[fba_activity_fields[1]].isin(activitynames))].reset_index(
                drop=True)
            # stack the produced/consumed sector columns into one de-duplicated
            # sector list to match against
            part1 = fsm_sub[['SectorConsumedBy']]
            part2 = fsm_sub[['SectorProducedBy']]
            part1.columns = ['Sector']
            part2.columns = ['Sector']
            modified_activitynames = pd.concat(
                [part1, part2], ignore_index=True).drop_duplicates()
            modified_activitynames = \
                modified_activitynames[modified_activitynames['Sector'].notnull()]
            modified_activitynames = modified_activitynames['Sector'].tolist()
            fba_allocation_subset = \
                fba_allocation.loc[
                    (fba_allocation[fbs_activity_fields[0]].isin(modified_activitynames)) |
                    (fba_allocation[fbs_activity_fields[1]].isin(modified_activitynames))]. \
                reset_index(drop=True)
        else:
            fba_allocation_subset = \
                fba_allocation.loc[(fba_allocation[fbs_activity_fields[0]].isin(activitynames)) |
                                   (fba_allocation[fbs_activity_fields[1]].isin(activitynames))].\
                reset_index(drop=True)

    # if activity set names included in function call and activity set names
    # is not null, then subset data based on value and column specified
    if subset_by_column_value:
        # create subset of activity names and allocation subset metrics
        asn_subset = asn[asn['name'].isin(activitynames)].reset_index(
            drop=True)
        if asn_subset['allocation_subset'].isna().all():
            # nothing to subset on for this activity set
            pass
        elif asn_subset['allocation_subset'].isna().any():
            # partially-filled column is treated as a configuration error
            log.error(
                'Define column and value to subset on in the activity set csv for all rows'
            )
        else:
            col_to_subset = asn_subset['allocation_subset_col'][0]
            val_to_subset = asn_subset['allocation_subset'][0]
            # subset fba_allocation_subset further
            log.debug('Subset the allocation dataset where %s = %s',
                      str(col_to_subset), str(val_to_subset))
            fba_allocation_subset = fba_allocation_subset[
                fba_allocation_subset[col_to_subset] ==
                val_to_subset].reset_index(drop=True)

    return fba_allocation_subset
def allocation_helper(df_w_sector, attr, method, v):
    """
    Function to help allocate activity names using secondary df.

    Loads a "helper" FlowByActivity dataset, merges it onto the primary
    allocation df by sector (and location, depending on geographic scales),
    then modifies 'FlowAmount' according to attr['helper_method']
    ('multiplication', 'proportional', or 'proportional-flagged').

    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    # (optional keys forwarded to load_map_clean_fba as keyword args)
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = \
        load_map_clean_fba(method, attr,
                           fba_sourcename=attr['helper_source'],
                           df_year=attr['helper_source_year'],
                           flowclass=attr['helper_source_class'],
                           geoscale_from=attr['helper_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           **fba_dict)

    # run sector disagg to capture any missing lower level naics
    helper_allocation = sector_disaggregation(helper_allocation)

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped (rows whose 'Sector' is null after collapsing)
    helper_allocation = \
        helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = \
        helper_allocation.drop(columns=['Activity', 'Min', 'Max'])
    # rename column so the merged-in value is distinguishable from the
    # primary df's 'FlowAmount'
    helper_allocation = \
        helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector field column is not all 'none', that is the column to merge
    if all(v is None for v in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(v is None for v in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        # NOTE(review): log.error does not raise, so execution continues and
        # sector_col_to_merge is left unbound here, causing a NameError at the
        # merge below — confirm whether this branch should raise instead
        log.error('There is not a clear sector column to base merge with '
                  'helper allocation dataset')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        # state-level helper onto county-level data: match on the 2-digit
        # state FIPS prefix of the Location code
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        # merge_columns.append('Location_tmp')
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(
                helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
                how='left',
                left_on=['Location_tmp', sector_col_to_merge],
                right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        # national helper onto subnational data: merge on sector only
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                              how='left',
                              left_on=[sector_col_to_merge],
                              right_on=['Sector'])
    else:
        # same geo scale: merge on location and sector
        # NOTE(review): no how= here, so this defaults to an inner join,
        # unlike the left joins above — confirm that is intentional
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(
                helper_allocation[['Location', 'Sector', 'HelperFlow']],
                left_on=['Location', sector_col_to_merge],
                right_on=['Location', 'Sector'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values = \
            helper_allocation[helper_allocation['Location'] ==
                              US_FIPS].reset_index(drop=True)
        replacement_values = \
            replacement_values.rename(
                columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        # fill NA HelperFlow from the national replacement value
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
                modified_fba_allocation['ReplacementValue'])
        # also substitute the national value where HelperFlow is 0
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            np.where(modified_fba_allocation['HelperFlow'] == 0,
                     modified_fba_allocation['ReplacementValue'],
                     modified_fba_allocation['HelperFlow'])
        # replace non-existent helper flow values with a 0, so after
        # multiplying, don't have incorrect value associated with new unit
        modified_fba_allocation['HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation = \
            modified_fba_allocation.drop(
                columns=["HelperFlow", 'ReplacementValue', 'Sector'])
    elif attr['helper_method'] == 'proportional':
        # compute per-location/activity FlowAmountRatio from HelperFlow and
        # scale FlowAmount by it
        modified_fba_allocation = \
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] = \
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation = \
            modified_fba_allocation.drop(
                columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])
    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation = \
            modified_fba_allocation.assign(
                Denominator=modified_fba_allocation.groupby(
                    ['FlowName', 'ActivityConsumedBy', 'Location',
                     'disaggregate_flag']
                )['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation = \
            modified_fba_allocation.assign(
                FlowAmount=modified_fba_allocation['FlowAmount'] *
                modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(
                columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                         'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        # NOTE(review): source formatting was ambiguous on whether
        # aggregation and the zero-row drop below apply only to the
        # 'proportional-flagged' branch or to all methods — confirm
        modified_fba_allocation = \
            sector_aggregation(modified_fba_allocation,
                               fba_mapped_wsec_default_grouping_fields)
        # drop rows of 0
        modified_fba_allocation = \
            modified_fba_allocation[
                modified_fba_allocation['FlowAmount'] != 0
            ].reset_index(drop=True)

    # normalize per-employee units after multiplication
    # (HelperFlow carried an employee count — presumably; verify against
    # the helper source)
    modified_fba_allocation.loc[
        modified_fba_allocation['Unit'] == 'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        # dynamically resolve and call the scaling function named in the
        # method yaml
        modified_fba_allocation = \
            dynamically_import_fxn(
                attr['allocation_source'],
                attr["scale_helper_results"])(modified_fba_allocation, attr)

    return modified_fba_allocation