def update_usgs_lake_names():
    """
    Updates the meta table of valid sites and associated info

    :return: Pandas Dataframe of sites and associated information
    """
    from utiils import printProgressBar
    import pandas as pd

    # TODO Document what each parameter means or have a link to USGS website explaining
    sites = get_usgs_sites()
    printProgressBar(0, len(sites), prefix='USGS Metadata Update:', suffix='Complete', length=50)
    big_df = []
    for count, site in enumerate(sites, 1):
        # Expanded RDB site metadata for one lake site
        lake_m_list = "https://waterservices.usgs.gov/nwis/site/?format=rdb&sites={}&" \
                      "siteOutput=expanded&siteType=LK&siteStatus=all&hasDataTypeCd=iv,dv".format(site)
        # The first data row of an RDB file is a column-format specifier, so drop it
        df2 = pd.read_csv(lake_m_list, sep="\t", comment="#", header=[0],
                          low_memory=False).drop([0], axis=0)
        big_df.append(df2)
        printProgressBar(count, len(sites), prefix='Retrieving new USGS-NWIS Lakes:',
                         suffix='Complete', length=50)
    return pd.concat(big_df)
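
# Usage sketch (not part of the pipeline): update_usgs_lake_names() returns one concatenated
# DataFrame of site metadata. 'site_no' and 'station_nm' are the usual USGS RDB column names,
# but verify them against the live response before depending on them.
#
#     meta_df = update_usgs_lake_names()
#     print(meta_df[['site_no', 'station_nm']].head())
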
def update_grealm_lake_levels(data_table):
    """
    Update Lake Water Levels from the [USDA-GREALM Database](https://ipad.fas.usda.gov/cropexplorer/global_reservoir/)

    :return: None
    """
    import pandas as pd
    from sqlalchemy import create_engine
    import config
    from utiils import get_lake_table, printProgressBar

    username = config.username
    password = config.password

    # Create database connection engine
    sql_engine = create_engine('mysql+pymysql://' + username + ':' + password +
                               '@lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com:3306/laketest').connect()

    # Read in GREALM unique lake IDs and observation dates from the reference table
    grealm_sql = u"SELECT `id_No`," \
                 u"`lake_name`," \
                 u"metadata->'$.grealm_database_ID' AS `grealm_ID`," \
                 u"metadata->'$.\"Satellite Observation Period\"' AS `observation_date`" \
                 u" FROM reference_ID WHERE `source` = 'grealm'"

    # Clean up the grealm_lakes_info dataframe
    grealm_lakes_info = pd.read_sql(grealm_sql, con=sql_engine)
    grealm_lakes_info['grealm_ID'] = grealm_lakes_info['grealm_ID'].str.strip('"')

    ls_df = []
    missing_data = []
    printProgressBar(0, len(grealm_lakes_info['grealm_ID']),
                     prefix='GREALM-USDA Lake Data Update:', suffix='Complete', length=50)
    for count, (grealm_id, u_id, name) in enumerate(zip(grealm_lakes_info['grealm_ID'],
                                                        grealm_lakes_info['id_No'],
                                                        grealm_lakes_info['lake_name']), 1):
        # TODO: attempt to get both 10 and 27 day data, merge into 1 df, then add to sql
        try:
            # Try the 10-day summary product first
            target_url = 'https://ipad.fas.usda.gov/lakes/images/lake{}.10d.2.txt'.format(grealm_id.zfill(4))
            source_df = pd.read_csv(target_url, skiprows=49, sep=r'\s+', header=None,
                                    parse_dates={'date': [2, 3, 4]},
                                    na_values=[99.99900, 999.99000, 9999.99000],
                                    infer_datetime_format=True, error_bad_lines=False,
                                    skip_blank_lines=True)
        except Exception:
            # 10-day summary does not exist for this lake; fall back to the 27-day average
            try:
                target_url = 'https://ipad.fas.usda.gov/lakes/images/lake{}.27a.2.txt'.format(grealm_id.zfill(4))
                source_df = pd.read_csv(target_url, skiprows=49, sep=r'\s+', header=None,
                                        parse_dates={'date': [2, 3, 4]},
                                        na_values=[99.99900, 999.99000, 9999.99000],
                                        infer_datetime_format=True, error_bad_lines=False,
                                        skip_blank_lines=True)
            except Exception:
                # No data found in either product
                missing_data.append((grealm_id, name))
                continue
        source_df = source_df[['date', 14]]
        source_df = source_df.dropna()
        source_df['date'] = pd.to_datetime(source_df['date'], format='%Y%m%d %H %M')
        source_df['date'] = source_df['date'].dt.strftime('%Y-%m-%d')
        source_df.insert(0, 'id_No', u_id)
        source_df.insert(2, 'lake_name', name)
        source_df = source_df.rename(columns={14: 'water_level'})
        ls_df.append(source_df)
        printProgressBar(count, len(grealm_lakes_info['grealm_ID']),
                         prefix='GREALM-USDA Lake Data Update:', suffix='Complete', length=50)
    raw_lake_level_df = pd.concat(ls_df, ignore_index=True, copy=False)
    print('There were {} lake(s) where no GREALM-USDA information could be located'.format(len(missing_data)))

    # Keep only rows not already in the database: left-only anti-join on ['id_No', 'date']
    existing_database_df = data_table
    existing_database_df['date'] = pd.to_datetime(existing_database_df['date'])
    raw_lake_level_df['date'] = pd.to_datetime(raw_lake_level_df['date'])
    sql_ready_df = pd.merge(left=raw_lake_level_df, right=existing_database_df,
                            on=['id_No', 'date'], how='left',
                            indicator=True).query('_merge == "left_only"').drop(['_merge'], axis=1)
    sql_ready_df = sql_ready_df.drop(['lake_name_y', 'water_level_y'], axis=1)
    sql_ready_df = sql_ready_df.rename(columns={'lake_name_x': 'lake_name', 'water_level_x': 'water_level'})
    sql_ready_df = sql_ready_df[~sql_ready_df[['id_No', 'date']].apply(frozenset, axis=1).duplicated()]
    sql_ready_df['date'] = sql_ready_df['date'].dt.strftime('%Y-%m-%d')
    sql_ready_df.to_sql('lake_water_level', con=sql_engine, index=False, if_exists='append', chunksize=2000)
    print("GREALM-USDA Lake Levels Updated")
    sql_engine.close()
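
# Usage sketch for the GREALM updater, assuming get_lake_table (imported from utiils above)
# returns the current `lake_water_level` table as a DataFrame with `id_No`, `date`,
# `lake_name`, and `water_level` columns; the exact call signature is an assumption.
#
#     existing_levels = get_lake_table()  # hypothetical signature
#     update_grealm_lake_levels(existing_levels)
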
def update_hydroweb_lake_levels(data_table):
    """
    Update Lake Water Levels from the [HydroWeb Database](http://hydroweb.theia-land.fr/)

    :return: None
    """
    import pandas as pd
    from sqlalchemy import create_engine
    from io import BytesIO
    from zipfile import ZipFile
    import urllib.request
    from utiils import printProgressBar
    import config

    # %% Section: MetaInfo
    __author__ = 'John Franey'
    __credits__ = ['John Franey', 'Jake Gearon']
    __version__ = '1.0.0'
    __maintainer__ = 'John Franey'
    __email__ = '*****@*****.**'
    __status__ = 'Development'

    # %%
    username = config.username
    password = config.password
    sql_engine = create_engine(
        'mysql+pymysql://' + username + ':' + password +
        '@lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com:3306/laketest').connect()

    # Read in HydroWeb unique lake IDs and names from the reference table
    hydroweb_sql = u"SELECT `id_No`," \
                   u"`lake_name` " \
                   u" FROM reference_ID WHERE `source` = 'hydroweb'"
    hydroweb_lakes_info = pd.read_sql(hydroweb_sql, con=sql_engine)

    # %% Section: Create df of all available hydroweb data
    ls_df = []
    url = "http://hydroweb.theia-land.fr/hydroweb/authdownload?products=lakes&[email protected]&pwd=vHs98NdXe9BxWNyok*Pp&format=txt"
    target_url = urllib.request.urlopen(url)
    with ZipFile(BytesIO(target_url.read())) as my_zip_file:
        printProgressBar(0, len(my_zip_file.namelist()),
                         prefix='HydroWeb Lake Data Update:', suffix='Complete', length=50)
        for count, contained_file in enumerate(my_zip_file.namelist(), 1):
            name = contained_file[11:-4]
            df = pd.read_csv(my_zip_file.open(contained_file), sep=';', comment='#', skiprows=1,
                             index_col=False, parse_dates=[1], infer_datetime_format=True,
                             names=['Decimal Year', 'date', 'Time | hh:mm', 'lake_level',
                                    'Standard deviation from height (m)', 'Area (km2)',
                                    'Volume with respect to volume of first date (km3)', 'Flag'])
            df['lake_name'] = name.capitalize()
            # unique_id = hydroweb_lakes_info.loc[hydroweb_lakes_info['lake_name'] == name, 'id_No']
            ls_df.append(df)
            printProgressBar(count, len(my_zip_file.namelist()),
                             prefix='HydroWeb Lake Data Update:', suffix='Complete', length=50)
    raw_lake_level_df = pd.concat(ls_df, ignore_index=True, copy=False)

    # %% Section: filter source df
    filtered_lake_levels = pd.DataFrame()
    filtered_lake_levels['date'] = raw_lake_level_df[raw_lake_level_df.columns[1]].dt.strftime('%Y-%m-%d')
    filtered_lake_levels['water_level'] = raw_lake_level_df[raw_lake_level_df.columns[3]]
    filtered_lake_levels['lake_name'] = raw_lake_level_df[raw_lake_level_df.columns[-1]]

    # %% Section: label with unique IDs and anti-join against existing rows
    id_labeled_df = pd.merge(hydroweb_lakes_info, filtered_lake_levels, on=['lake_name'])
    existing_database_df = data_table
    existing_database_df['date'] = pd.to_datetime(existing_database_df['date'])
    id_labeled_df['date'] = pd.to_datetime(id_labeled_df['date'])
    sql_ready_df = pd.merge(left=id_labeled_df, right=existing_database_df,
                            on=['id_No', 'date'], how='left',
                            indicator=True).query('_merge == "left_only"').drop(['_merge'], axis=1)
    sql_ready_df = sql_ready_df.drop(['lake_name_y', 'water_level_y'], axis=1)
    sql_ready_df = sql_ready_df.rename(columns={'lake_name_x': 'lake_name', 'water_level_x': 'water_level'})
    sql_ready_df = sql_ready_df[~sql_ready_df[['id_No', 'date']].apply(frozenset, axis=1).duplicated()]
    sql_ready_df['date'] = sql_ready_df['date'].dt.strftime('%Y-%m-%d')
    sql_ready_df.to_sql('lake_water_level', con=sql_engine, index=False, if_exists='append', chunksize=2000)
    print("HydroWeb Lake Levels Updated")
    sql_engine.close()
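
# Usage sketch for the HydroWeb updater; like the GREALM updater it expects the current
# `lake_water_level` table so the left-only anti-join on ['id_No', 'date'] can drop rows
# that are already stored.
#
#     existing_levels = get_lake_table()  # hypothetical signature, from utiils
#     update_hydroweb_lake_levels(existing_levels)
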
def update_usgs_lake_levels(data_table):
    """
    Writes USGS lake level data into the existing database, appending new data dynamically

    :return: None
    """
    from datetime import datetime
    import requests
    from requests.exceptions import HTTPError
    import pandas as pd
    from sqlalchemy import create_engine
    import pymysql
    from utiils import printProgressBar
    import config

    username = config.username
    password = config.password
    sql_engine = create_engine('mysql+pymysql://' + username + ':' + password +
                               '@lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com:3306/laketest').connect()
    connection = pymysql.connect(host='lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com',
                                 user=username, password=password, db='laketest',
                                 connect_timeout=100000)
    cursor = connection.cursor()

    usgs_sql = u"SELECT `id_No`," \
               u"`lake_name` " \
               u" FROM reference_ID WHERE `source` = 'usgs'"
    begin_date = '1838-01-01'
    now = datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    # TODO for updating, only grab active sites!
    # TODO use modified-since for updating!

    # Read in USGS unique lake IDs and observation dates from the reference table
    usgs_lakes_info = pd.read_sql(usgs_sql, con=sql_engine)

    # %%
    sites = get_usgs_sites()
    df_ls = []
    missing_sites = []
    printProgressBar(0, len(sites), prefix='USGS Lake Data Update:', suffix='Complete', length=50)
    for count, site in enumerate(sites, 1):
        target_url = 'http://waterservices.usgs.gov/nwis/dv/?sites={}&siteType=LK&startDT={}&endDT={}' \
                     '&statCd=00003,00011,00001,32400,30800,30600&format=json&variable=00062,00065,' \
                     '30211,62600,62614,62615,62616,62617,62618,72020,' \
                     '72292,72293,72333,99020,72178,72199,99065,30207,' \
                     '72214,72264,72275,72335,72336'.format(site, begin_date, end_date).replace("%2C", ",")
        try:
            response = requests.get(target_url)
            response.raise_for_status()
            # Access the JSON content
            jsonResponse = response.json()
            site_name = jsonResponse["value"]["timeSeries"][0]['sourceInfo']['siteName']
            df = pd.DataFrame.from_dict(jsonResponse["value"]['timeSeries'][0]["values"][0]['value'],
                                        orient="columns").drop('qualifiers', axis=1)
            df["lake_name"] = site_name
            df_ls.append(df)
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
        except IndexError as e:
            print(e)
            print('site data for {} not found, check parameters!'.format(site))
            missing_sites.append(site)
        except Exception as err:
            print(f'Other error occurred: {err}')
        printProgressBar(count, len(sites), prefix='USGS Lake Data Update:', suffix='Complete', length=50)

    # %%
    usgs_source_df = pd.concat(df_ls, ignore_index=True, copy=False)
    usgs_source_df["date"] = pd.to_datetime(usgs_source_df["dateTime"], format='%Y-%m-%d')
    id_labeled_df = pd.merge(usgs_lakes_info, usgs_source_df, on=['lake_name'])
    id_labeled_df['date'] = id_labeled_df['date'].dt.strftime('%Y-%m-%d')
    id_labeled_df = id_labeled_df.drop(['dateTime'], axis=1)
    id_labeled_df = id_labeled_df.rename(columns={'value': 'water_level'})
    existing_database_df = data_table
    existing_database_df['date'] = pd.to_datetime(existing_database_df['date'])
    id_labeled_df['date'] = pd.to_datetime(id_labeled_df['date'])

    # %% Keep only rows not already in the database: left-only anti-join on ['id_No', 'date']
    sql_ready_df = pd.merge(left=id_labeled_df, right=existing_database_df,
                            on=['id_No', 'date'], how='left',
                            indicator=True).query('_merge == "left_only"').drop(['_merge'], axis=1)
    sql_ready_df = sql_ready_df.drop(['lake_name_y', 'water_level_y'], axis=1)
    sql_ready_df = sql_ready_df.rename(columns={'lake_name_x': 'lake_name', 'water_level_x': 'water_level'})
    sql_ready_df = sql_ready_df[~sql_ready_df[['id_No', 'date']].apply(frozenset, axis=1).duplicated()]
    sql_ready_df['date'] = sql_ready_df['date'].dt.strftime('%Y-%m-%d')
    sql_ready_df.to_sql('lake_water_level', con=sql_engine, index=False, if_exists='append', chunksize=100000)
    if not missing_sites:
        print('All sites processed')
    else:
        print('Missing {} out of {} sites.'.format(len(missing_sites), len(sites)))
        # print('Missing Site IDs:')
        # for site in missing_sites:
        #     print(site)
    print("USGS-NWIS Lake Levels Updated")
    connection.close()
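
# Usage sketch for the USGS updater. Note that get_usgs_sites() must be defined elsewhere in
# this module (it is called here and in update_usgs_lake_names() without an import).
#
#     existing_levels = get_lake_table()  # hypothetical signature, from utiils
#     update_usgs_lake_levels(existing_levels)
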
def reference_table_metadata_json_replace():
    """
    Update lake metadata within the Reference ID table

    :return: None
    """
    import pandas as pd
    from sqlalchemy import create_engine
    from lake_table_usgs import update_usgs_lake_names
    from utiils import printProgressBar
    import pymysql
    import json
    import config

    username = config.username
    password = config.password

    # Create database connection engines and cursor
    sql_engine = create_engine(
        'mysql+pymysql://' + username + ':' + password +
        '@lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com:3306/laketest').connect()
    connection = pymysql.connect(host='lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com',
                                 user=username, password=password, db='laketest')
    cursor = connection.cursor()

    # Read in reference table for unique Lake ID and Lake name
    # id_table = lake_reference_df
    # NOTE: this assumes the metadata column does not exist yet; re-running raises a duplicate-column error
    sql_command_metadata = u"ALTER TABLE reference_ID ADD COLUMN metadata JSON AFTER lake_name"
    cursor.execute(sql_command_metadata)
    id_table = pd.read_sql('select * from reference_ID', con=sql_engine)

    # Read in grealm summary table and clean dataframe
    grealm_url = 'https://ipad.fas.usda.gov/lakes/images/LakesReservoirsCSV.txt'
    grealm_source_df = pd.read_csv(grealm_url, skiprows=3, sep="\t", header=0,
                                   parse_dates=[-1], infer_datetime_format=True,
                                   error_bad_lines=False, skip_blank_lines=True)
    grealm_source_df = grealm_source_df[~grealm_source_df['Lake ID'].str.contains("Total")]
    grealm_source_df = grealm_source_df.rename(columns={'Name': 'lake_name',
                                                        'Lake ID': 'grealm_database_ID'})
    # Rename lake name to <lake name>_<Resolution> if 2 or more versions of the lake exist
    grealm_source_df.loc[grealm_source_df.lake_name.duplicated(keep=False), 'lake_name'] = \
        grealm_source_df.loc[grealm_source_df.lake_name.duplicated(keep=False), 'lake_name'] + '_' + \
        grealm_source_df.loc[grealm_source_df.lake_name.duplicated(keep=False), 'Resolution'].astype(str)

    # Merge reference and grealm tables while keeping unique lake ID number from db, convert to json dict
    grealm_id_table = id_table[(id_table['source'] == 'grealm')]
    grealm_id_table = grealm_id_table.loc[grealm_id_table.index.difference(grealm_id_table.dropna().index)]
    grealm_id_table = grealm_id_table.drop(['metadata'], axis=1)
    df_grealm = grealm_id_table.merge(grealm_source_df, on='lake_name', how='inner')
    df_grealm = df_grealm.drop_duplicates(subset=['id_No'])
    df_grealm = df_grealm.set_index('id_No')
    df_grealm.dropna(axis=1, how='all', inplace=True)
    grealm_json = df_grealm.to_json(orient='index')
    # json.loads handles null/true/false natively, so no string replacements or eval() are needed
    grealm_dict = json.loads(grealm_json)
    print('grealm metadata prepped')

    # Repeat process with hydroweb summary table; results in json dict keyed by unique lake ID
    hydroweb_url = 'http://hydroweb.theia-land.fr/hydroweb/authdownload?list=lakes&format=txt'
    hydroweb_df = pd.read_csv(hydroweb_url)
    hydroweb_df = hydroweb_df.rename(columns={'lake': 'lake_name'})
    hydroweb_id_table = id_table.loc[id_table['source'] == 'hydroweb']
    hydroweb_id_table = hydroweb_id_table.loc[hydroweb_id_table.index.difference(hydroweb_id_table.dropna().index)]
    hydroweb_id_table = hydroweb_id_table.drop(['metadata'], axis=1)
    hydroweb_indexed_df = pd.merge(hydroweb_df, hydroweb_id_table, on='lake_name')
    hydroweb_indexed_df = hydroweb_indexed_df.set_index('id_No')
    hydroweb_json = hydroweb_indexed_df.to_json(orient='index')
    hydroweb_dict = json.loads(hydroweb_json)
    print('hydroweb metadata prepped')

    # USGS metadata requires use of functions from lake_table_usgs.py, but end result is json dict with unique lake ID
    # usgs_df = id_table.loc[id_table['source'] == 'usgs']
    # usgs_df = update_usgs_lake_names()
    usgs_df = pd.read_csv('usgs_test_from_id_table.csv')
    # usgs_df.to_csv('usgs_test_from_id_table.csv')
    usgs_df = usgs_df.rename(columns={'station_nm': 'lake_name'})
    usgs_id_table = id_table.loc[id_table['source'] == 'usgs']
    usgs_id_table = usgs_id_table.loc[usgs_id_table.index.difference(usgs_id_table.dropna().index)]
    usgs_df = pd.merge(usgs_df, usgs_id_table, on='lake_name')
    usgs_df = usgs_df.set_index('id_No')
    usgs_df = usgs_df.drop(['metadata'], axis=1)
    usgs_json = usgs_df.to_json(orient='index')
    usgs_dict = json.loads(usgs_json)
    print('USGS metadata prepped')
    # print(usgs_dict)
    # print(grealm_dict)
    # print(hydroweb_dict)

    # Execute mysql commands
    sql_command = u"UPDATE `reference_ID` SET `metadata` = (%s) WHERE `id_No` = (%s);"
    if len(grealm_dict.values()) > 0:
        printProgressBar(0, len(grealm_dict.values()), prefix='GREALM-USDA:', suffix='Complete', length=50)
        for count, (key, value) in enumerate(grealm_dict.items(), 1):
            cursor.execute(sql_command, (json.dumps(value), key))
            printProgressBar(count, len(grealm_dict.values()), prefix='GREALM-USDA:',
                             suffix='Complete', length=50)
        connection.commit()
    if len(hydroweb_dict.values()) > 0:
        printProgressBar(0, len(hydroweb_dict.values()), prefix='HydroWeb:', suffix='Complete', length=50)
        for count, (key, value) in enumerate(hydroweb_dict.items(), 1):
            cursor.execute(sql_command, (json.dumps(value), key))
            printProgressBar(count, len(hydroweb_dict.values()), prefix='HydroWeb:',
                             suffix='Complete', length=50)
        connection.commit()
    if len(usgs_dict.values()) > 0:
        printProgressBar(0, len(usgs_dict.values()), prefix='USGS-NWIS:', suffix='Complete', length=50)
        for count, (key, value) in enumerate(usgs_dict.items(), 1):
            cursor.execute(sql_command, (json.dumps(value), key))
            printProgressBar(count, len(usgs_dict.values()), prefix='USGS-NWIS:',
                             suffix='Complete', length=50)
        connection.commit()
    connection.close()
    sql_engine.close()
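
# A minimal driver sketch, assuming get_lake_table (from utiils) returns the current
# `lake_water_level` table as a DataFrame; that signature, and the order of the update
# calls, are assumptions rather than the project's documented entry point.
if __name__ == '__main__':
    from utiils import get_lake_table

    existing_levels = get_lake_table()  # hypothetical signature
    update_usgs_lake_levels(existing_levels)
    update_grealm_lake_levels(existing_levels)
    update_hydroweb_lake_levels(existing_levels)
    reference_table_metadata_json_replace()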