def column_names(file_name):
    """
    Download and parse the XML metadata legend for the region that contains
    the given ScienceBase file, returning the attribute labels ('attrlabl')
    and definitions ('attrdef') as a dataframe.
    :param file_name: str, ScienceBase file identifier/query string
    :return: dataframe with 'label' and 'name' columns
    """
    base_url = 'https://www.sciencebase.gov/catalog/file/get/'
    pacific_region = ['5d407318e4b01d82ce8d9b3c?f=__disk__22%2F5c%2Fe3%2F225'
                      'ce31141477eb0904f38f95f1d472bbe2a2a11',
                      '5d407318e4b01d82ce8d9b3c?f=__disk__2b%2F75%2F2b%2F2b7'
                      '52b0c5decf8e83c035d559a2688c481bb0cfe']
    midwestern = ['5cbf5150e4b09b8c0b700df3?f=__disk__66%2F4f%2Ff2%2F664ff289'
                  '064560bbce748082f7b34593dad49ca2',
                  '5cbf5150e4b09b8c0b700df3?f=__disk__bf%2F73%2F1f%2Fbf731fdf'
                  '4e984a5cf50c0f1a140cda366cb8c1d3']
    northeastern = ['5d4192aee4b01d82ce8da477?f=__disk__c2%2F02%2F06%2Fc202060'
                    '78520c5ec87394a3499eea073f472a27d',
                    '5d4192aee4b01d82ce8da477?f=__disk__b0%2Fb9%2F35%2Fb0b9350'
                    '21a47ccf57f7584cc7f14d82aacc491d1']
    southwestern = ['5f8f1f1282ce06b040efc90e?f=__disk__f8%2Fb8%2Ff9%2Ff8b8f9'
                    'bdc2a07f014ed6dced8feb2dd7bc63e056',
                    '5f8f1f1282ce06b040efc90e?f=__disk__8e%2F8e%2Fb8%2F8e8eb8'
                    '203ea14ab19a45372919a0dbf667d033b2']
    southeastern = ['5d6e70e5e4b0c4f70cf635a1?f=__disk__fb%2Fdb%2F92%2Ffbdb928'
                    '1872069b23bcd134a4c5fa1ddc7280b53',
                    '5d6e70e5e4b0c4f70cf635a1?f=__disk__14%2Fc1%2F63%2F14c1636'
                    'eef91529f548d5fe29ff3f426d3b4b996']

    # map the region containing the data file to its legend file
    if file_name in pacific_region:
        legend_name = "5d407318e4b01d82ce8d9b3c?f=__disk__ab%2F27%2F08%2Fab" \
                      "27083f354bd851ec09bc0f33c2dc130f808bb5"
    elif file_name in midwestern:
        legend_name = "5cbf5150e4b09b8c0b700df3?f=__disk__a6%2Ffb%2Fd6%2Fa6f" \
                      "bd6f6bcce874109d2e989d1d4d5a67c33cd49"
    elif file_name in northeastern:
        legend_name = "5d4192aee4b01d82ce8da477?f=__disk__81%2F5d%2F3d%2F815" \
                      "d3deb08f82c1662ff94eb941074ff99c75088"
    elif file_name in southwestern:
        legend_name = "5f8f1f1282ce06b040efc90e?f=__disk__44%2Ff6%2F74%2F44f" \
                      "674b54b2fa571191a597c8dfae0923893d3d3"
    elif file_name in southeastern:
        legend_name = "5d6e70e5e4b0c4f70cf635a1?f=__disk__93%2Fba%2F5c%2F93b" \
                      "a5c50c58ced4116ad2e5b9783fc7848ab2cb5"

    # download the legend XML and collect attribute labels and definitions
    contents = make_url_request(base_url + legend_name)
    xslt_content = contents.content.decode('utf-8')
    root = ET.fromstring(xslt_content)
    label = []
    name = []
    for attr in root.iter('attr'):
        for child in attr:
            if str(child.tag) == 'attrlabl':
                label.append(str(child.text))
            if str(child.tag) == 'attrdef':
                name.append(str(child.text))
    legend = pd.DataFrame()
    legend["label"] = label
    legend["name"] = name
    return legend
def get_data_commons_index(file_meta, paths):
    """Returns a dataframe of files available on data commons for the
    particular tool and category (taken from file_meta).
    :param file_meta: instance of class FileMeta
    :param paths: instance of class Paths
    :return: dataframe with 'date' and 'file_name' as fields, or None if no
        data are found at the url
    """
    index_url = '?prefix='
    subdirectory = file_meta.tool + '/'
    if file_meta.category != '':
        subdirectory = subdirectory + file_meta.category + '/'
    url = paths.remote_path + index_url + subdirectory
    listing = make_url_request(url)

    # Code to convert XML to pd df courtesy of
    # https://stackabuse.com/reading-and-writing-xml-files-in-python-with-panda
    contents = ET.XML(listing.text)
    data = []
    cols = []
    for i, child in enumerate(contents):
        data.append([subchild.text for subchild in child])
        cols.append(child.tag)
    df = pd.DataFrame(data)
    df.dropna(inplace=True)
    try:
        # only get first two columns and rename them name and last modified
        df = df[[0, 1]]
    except KeyError:
        # no data found at url
        return None
    df.columns = ['file_name', 'last_modified']
    # Reformat the date to a pd datetime
    df['date'] = pd.to_datetime(df['last_modified'],
                                format='%Y-%m-%dT%H:%M:%S')
    # Remove the category name and trailing slash from the file name
    df['file_name'] = df['file_name'].str.replace(subdirectory, "")
    # Reset the index and return
    df = df[['date', 'file_name']].reset_index(drop=True)
    return df
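# Illustrative usage sketch (not part of the original source). It relies only
# on what get_data_commons_index() itself assumes: a populated FileMeta with
# 'tool' and 'category' attributes and a Paths instance with 'remote_path'.
# The helper name below is hypothetical.
def _example_list_remote_files(file_meta, paths):
    """Print the name and date of each file indexed on data commons."""
    index = get_data_commons_index(file_meta, paths)
    if index is None:
        print('no files found for', file_meta.tool, file_meta.category)
        return
    for _, row in index.iterrows():
        print(row['date'], row['file_name'])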
def call_urls(*, url_list, source, year, config):
    """
    This method calls all the urls that have been generated.
    It then calls the processing method to begin processing the returned
    data. The processing method is specific to the data source, so this
    function relies on a function in source.py
    :param url_list: list, urls to call
    :param source: str, data source
    :param year: str, year
    :param config: dictionary, FBA yaml
    :return: list, dfs to concat and parse
    """
    # identify if url request requires cookies set
    set_cookies = config.get('allow_http_request_cookies')
    confirm_gdrive = config.get('confirm_gdrive')

    # create dataframes list by iterating through url list
    data_frames_list = []
    if url_list[0] is not None:
        for url in url_list:
            log.info("Calling %s", url)
            resp = make_url_request(url,
                                    set_cookies=set_cookies,
                                    confirm_gdrive=confirm_gdrive)
            if "call_response_fxn" in config:
                # dynamically import and call on function
                df = dynamically_import_fxn(
                    source, config["call_response_fxn"])(resp=resp,
                                                         source=source,
                                                         year=year,
                                                         config=config,
                                                         url=url)
            if isinstance(df, pd.DataFrame):
                data_frames_list.append(df)
            elif isinstance(df, list):
                data_frames_list.extend(df)

    return data_frames_list
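# Illustrative sketch (not part of the original source) of the shape of a
# source-specific "call_response_fxn" that call_urls() resolves by name with
# dynamically_import_fxn(). The keyword signature mirrors the call site above;
# the CSV-parsing body is a hypothetical example, not any particular source's
# parser, and assumes io and pandas (pd) are imported as in the modules above.
def example_csv_call(*, resp, source, year, config, url):
    """Convert a url response into a dataframe for later parsing."""
    df = pd.read_csv(io.BytesIO(resp.content))
    # tag records with their origin so downstream parsing can trace them
    df = df.assign(SourceName=source, Year=year)
    return df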
def download_eGRID(year):
    """Download eGRID files from EPA website."""
    log.info(f'downloading eGRID data for {year}')

    download_url = _config[year]['download_url']
    egrid_file_name = _config[year]['file_name']

    r = make_url_request(download_url)

    # extract .xlsx workbook
    if year == '2016' or year == '2014':
        z = zipfile.ZipFile(io.BytesIO(r.content))
        workbook = z.read(egrid_file_name)
    else:
        workbook = r.content

    # save .xlsx workbook to destination directory
    destination = OUTPUT_PATH.joinpath(egrid_file_name)
    # if destination folder does not already exist, create it
    OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
    with open(destination, 'wb') as output:
        output.write(workbook)
    log.info(f'{egrid_file_name} saved to {OUTPUT_PATH}')
def download_from_remote(file_meta, paths, **kwargs):
    """
    Downloads one or more files from remote and stores locally based on the
    most recent instance of that file. All files that share name_data,
    version, and hash will be downloaded together.
    :param file_meta: populated instance of class FileMeta
    :param paths: instance of class Paths
    :param kwargs: option to include 'subdirectory_dict', a dictionary that
        directs local data storage location based on extension
    """
    base_url = paths.remote_path + file_meta.tool + '/'
    if file_meta.category != '':
        base_url = base_url + file_meta.category + '/'
    files = get_most_recent_from_index(file_meta, paths)
    if files == []:
        log.info('%s not found in %s', file_meta.name_data, base_url)
    else:
        for f in files:
            url = base_url + f
            r = make_url_request(url)
            if r is not None:
                # set subdirectory
                subdirectory = file_meta.category
                # if there is a dictionary with specific subdirectories
                # based on end of filename, modify the subdirectory
                if 'subdirectory_dict' in kwargs:
                    for k, v in kwargs['subdirectory_dict'].items():
                        if f.endswith(k):
                            subdirectory = v
                folder = os.path.realpath(paths.local_path + '/'
                                          + subdirectory)
                file = folder + "/" + f
                create_paths_if_missing(file)
                log.info('%s saved to %s', f, folder)
                with open(file, 'wb') as fh:
                    fh.write(r.content)
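# Illustrative usage sketch (not part of the original source): routing
# downloaded files into different local subdirectories via the optional
# 'subdirectory_dict' kwarg. The keys only need to match the end of the
# remote file name; the mapping shown here is hypothetical.
def _example_download(file_meta, paths):
    download_from_remote(file_meta, paths,
                         subdirectory_dict={'_metadata.json': 'metadata',
                                            '.log': 'log'})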
def annual_fips(years):
    """Function to pull the FIPS codes/names from the Census website.
    Columns are renamed and subset.
    :param years: list of years (as strings) to include in the FIPS crosswalk
    :return: dictionary of dataframes keyed 'FIPS_<year>'
    """
    df_list = {}
    for year in years:
        # only works for 2015+; contacted Census on 5/1 to ask for
        # county-level fips for previous years
        if year == '2013':
            url = 'https://www2.census.gov/programs-surveys/popest/geographies/' + \
                  year + '/all-geocodes-v' + year + '.xls'
        else:
            url = "https://www2.census.gov/programs-surveys/popest/geographies/" + \
                  year + "/all-geocodes-v" + year + ".xlsx"

        r = make_url_request(url)
        raw_df = pd.read_excel(io.BytesIO(
            r.content)).dropna().reset_index(drop=True)

        # skip the first few rows
        FIPS_df = pd.DataFrame(raw_df.loc[1:]).reindex()
        # Assign the column titles (remove whitespace and new lines if
        # they exist)
        FIPS_df.columns = raw_df.loc[0, ].str.replace(' |\\n', '')

        original_cols = FIPS_df.columns

        # Create a dictionary of geographic levels
        geocode_levels = {
            "010": "Country",
            "040": "State",
            "050": "County_" + year
        }
        level_codes = geocode_levels.keys()
        # filter df for records with the levels of interest
        FIPS_df = FIPS_df.loc[FIPS_df["SummaryLevel"].isin(level_codes)]

        # split df by level to return a list of dfs
        # use a list comprehension to split it out
        FIPS_bylevel = [
            pd.DataFrame(y)
            for x, y in FIPS_df.groupby("SummaryLevel", as_index=False)
        ]

        # Assume df order in list is in geolevels keys order
        # country does not have its own field
        state_and_county_fields = {
            "Country": ["StateCode(FIPS)"],
            "State": ["StateCode(FIPS)"],
            "County_" + year: ["StateCode(FIPS)", "CountyCode(FIPS)"]
        }

        name_field = "AreaName(includinglegal/statisticalareadescription)"

        new_dfs = {}
        for df in FIPS_bylevel:
            df = df.reset_index(drop=True)
            level = geocode_levels[df.loc[0, "SummaryLevel"]]
            new_df = df[original_cols]
            new_df = new_df.rename(columns={name_field: level})
            fields_to_keep = [str(x) for x in state_and_county_fields[level]]
            fields_to_keep.append(level)
            new_df = new_df[fields_to_keep]
            # Write each to the list
            new_dfs[level] = new_df

        # Now merge the new dfs to add the info
        # FIPS_df_new = FIPS_df
        for k, v in new_dfs.items():
            fields_to_merge = [str(x) for x in state_and_county_fields[k]]
            # FIPS_df_new = pd.merge(FIPS_df_new, v, on=fields_to_merge,
            #                        how="left")
            FIPS_df = pd.merge(FIPS_df, v, on=fields_to_merge, how="left")

        # combine state and county codes
        FIPS_df['FIPS_' + year] = \
            FIPS_df[state_and_county_fields["County_" + year][0]].astype(str) + \
            FIPS_df[state_and_county_fields["County_" + year][1]].astype(str)

        fields_to_keep = ["State", "County_" + year, "FIPS_" + year]
        FIPS_df = FIPS_df[fields_to_keep]

        # Clean the county field - remove the " County"
        # FIPS_df["County"] = FIPS_df["County"].apply(lambda x: stripcounty(x))
        FIPS_df["County_" + year] = FIPS_df["County_" + year].apply(stripcounty)
        FIPS_df["County_" + year] = \
            FIPS_df["County_" + year].apply(clean_str_and_capitalize)
        FIPS_df["State"] = FIPS_df["State"].apply(clean_str_and_capitalize)

        # add to data dictionary of fips years
        df_list["FIPS_" + year] = FIPS_df
    return df_list
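# Illustrative usage sketch (not part of the original source): annual_fips()
# takes an iterable of year strings and returns a dictionary of dataframes
# keyed 'FIPS_<year>', each holding 'State', 'County_<year>' and 'FIPS_<year>'
# columns. The year used here is only an example.
def _example_fips_lookup():
    fips_dict = annual_fips(['2015'])
    return fips_dict['FIPS_2015'][['State', 'County_2015', 'FIPS_2015']]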
"""
Script creates crosswalks for Land and Water
"""

import io
import pandas as pd
from esupy.remote import make_url_request
from flowsa.settings import datapath
from flowsa.data_source_scripts.EIA_CBECS_Land import \
    standardize_eia_cbecs_land_activity_names

if __name__ == '__main__':
    # url for excel crosswalk
    url = 'http://www.eia.gov/consumption/commercial/data/archive/cbecs/PBAvsNAICS.xls'
    # make url request, as defined in esupy.remote
    r = make_url_request(url)
    # Convert response to dataframe, skipping first three rows
    df_raw = pd.read_excel(io.BytesIO(r.content), skiprows=3)
    # Rename first column to sector (naics 2002)
    df = df_raw.rename(columns={df_raw.columns[0]: "Sector"})
    # remove row of just NAs
    df = df[df['Sector'].notna()]
    # remove description in first column
    df['Sector'] = df['Sector'].str.split('/').str[0]
    # reshape data to long format and name columns
    df = pd.melt(df, id_vars=['Sector'])
    df.columns = ['Sector', 'Activity', 'value']
# write_Larson_UrbanPublicParks_SI.py (scripts)
#!/usr/bin/env python3
# coding=utf-8
"""
Load and save the SI parks data from
Larson LR, Jennings V, Cloutier SA (2016) Public Parks and Wellbeing in
Urban Areas of the United States.
PLoS ONE 11(4): e0153211. https://doi.org/10.1371/journal.pone.0153211
SI obtained 08/26/2020
"""

import io
import pandas as pd
from esupy.remote import make_url_request
from flowsa.settings import externaldatapath

# supporting information (S1) spreadsheet from Larson et al. (2016)
csv_load = "https://doi.org/10.1371/journal.pone.0153211.s001"

if __name__ == '__main__':
    response = make_url_request(csv_load)
    # Read directly into a pandas df
    raw_df = pd.read_excel(io.BytesIO(
        response.content)).dropna().reset_index(drop=True)
    # save data to csv
    raw_df.to_csv(externaldatapath + "Larson_UrbanPublicParks_SI.csv",
                  index=False)