def update_usgs_lake_names():
    """
    Updates the meta table of valid sites and associated info
    :return: pandas DataFrame of sites and associated information
    """
    from utiils import printProgressBar
    import pandas as pd
    # TODO Document what each parameter means or have a link to USGS website explaining
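    # The NWIS site-service query below uses: format=rdb (tab-delimited output),
    # siteOutput=expanded (full metadata columns), siteType=LK (lakes/reservoirs),
    # siteStatus=all, and hasDataTypeCd=iv,dv (sites with instantaneous or daily values).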
    sites = get_usgs_sites()
    printProgressBar(0, len(sites), prefix='USGS Metadata Update:', suffix='Complete', length=50)
    big_df = []
    for count, site in enumerate(sites, 1):
        lake_m_list = "https://waterservices.usgs.gov/nwis/site/?format=rdb&sites={}&" \
                      "siteOutput=expanded&siteType=LK&siteStatus=all&hasDataTypeCd=iv,dv".format(site)
        df2 = pd.read_csv(lake_m_list, sep="\t", comment="#", header=[0], low_memory=False).drop([0], axis=0)
        big_df.append(df2)
        printProgressBar(count, len(sites), prefix='Retrieving new USGS-NWIS Lakes:', suffix='Complete', length=50)
    return pd.concat(big_df)
def update_grealm_lake_levels(data_table):
    """
    Update Lake Water Levels from the [USDA-GREALM Database](https://ipad.fas.usda.gov/cropexplorer/global_reservoir/)
    :return: None
    """
    import pandas as pd
    from sqlalchemy import create_engine
    import config
    from utiils import get_lake_table

    username = config.username
    password = config.password
    # Create database connection engines and cursor
    sql_engine = create_engine('mysql+pymysql://' + username + ':' + password +
                               '@lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com:3306/laketest').connect()

    grealm_sql = u"SELECT `id_No`," \
                 u"`lake_name`," \
                 u"metadata->'$.grealm_database_ID' AS `grealm_ID`," \
                 u"metadata->'$.\"Satellite Observation Period\"' AS `observation_date`" \
                 u" FROM reference_ID WHERE `source` = 'grealm'"

    # Read in Grealm unique lake ID and observation dates from reference Table
    # Clean up the grealm_lakes_info dataframe
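    # MySQL's -> operator extracts fields from the JSON metadata column; string values
    # come back JSON-quoted, which is why the quotes are stripped from grealm_ID below.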

    grealm_lakes_info = pd.read_sql(grealm_sql, con=sql_engine)
    grealm_lakes_info['grealm_ID'] = grealm_lakes_info['grealm_ID'].str.strip('"')


    ls_df = []
    missing_data = []
    from utiils import printProgressBar
    printProgressBar(0, len(grealm_lakes_info['grealm_ID']), prefix='GREALM-USDA Lake Data Update:', suffix='Complete',
                     length=50)
    for count, (grealm_id, u_id, name) in enumerate(zip(grealm_lakes_info['grealm_ID'],
                                     grealm_lakes_info['id_No'],
                                     grealm_lakes_info['lake_name']), 1):
        # TODO: attempt to get both 10 and 27 day data, merge into 1 df, then add to sql
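        # Try the 10-day product first, then fall back to the 27-day average below.
        # The G-REALM text files carry a long fixed header (hence skiprows=49); columns
        # 2-4 hold year/month/day and column 14 holds the water level value used later
        # (inferred from the parse/rename logic in this function).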
        try:
            target_url = 'https://ipad.fas.usda.gov/lakes/images/lake{}.10d.2.txt'.format(grealm_id.zfill(4))
            source_df = pd.read_csv(target_url, skiprows=49, sep='\s+', header=None, parse_dates={'date': [2, 3, 4]},
                                    na_values=[99.99900, 999.99000, 9999.99000], infer_datetime_format=True,
                                    error_bad_lines=False, skip_blank_lines=True)
        except Exception as e:
            #print('*******************************************************')
            #print(e)
            #print('10 day summary does not exist for {}: {}\nRedirecting to 27 day average'.format(grealm_id, name))

            try:
                target_url = 'https://ipad.fas.usda.gov/lakes/images/lake{}.27a.2.txt'.format(grealm_id.zfill(4))
                source_df = pd.read_csv(target_url, skiprows=49, sep='\s+', header=None, parse_dates={'date': [2, 3, 4]},
                                        na_values=[99.99900, 999.99000, 9999.99000], infer_datetime_format=True,
                                        error_bad_lines=False, skip_blank_lines=True)
            except Exception as e:
                #print(e)
                #print('No Data found for {}: {}'.format(grealm_id, name))
                missing_data.append((grealm_id, name))
                continue

        source_df = source_df[['date', 14]]
        source_df = source_df.dropna()
        source_df['date'] = pd.to_datetime(source_df['date'], format='%Y%m%d %H %M')
        source_df['date'] = source_df['date'].dt.strftime('%Y-%m-%d')
        source_df.insert(0, 'id_No', u_id)
        source_df.insert(2, 'lake_name', name)
        source_df = source_df.rename(columns={14: 'water_level'})
        ls_df.append(source_df)
        printProgressBar(count, len(grealm_lakes_info['grealm_ID']), prefix='GREALM-USDA Lake Data Update:',
                         suffix='Complete',
                         length=50)
    raw_lake_level_df = pd.concat(ls_df, ignore_index=True, copy=False)
    print('There were {} lake(s) where no GREALM-USDA information could be located'.format(len(missing_data)))
    existing_database_df = data_table

    existing_database_df['date'] = pd.to_datetime(existing_database_df['date'])
    raw_lake_level_df['date'] = pd.to_datetime(raw_lake_level_df['date'])

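    # Anti-join: a left merge with indicator=True, keeping only 'left_only' rows, leaves
    # just the records that are not already present in the database table.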
    sql_ready_df = pd.merge(left = raw_lake_level_df, right = existing_database_df, on = ['id_No', 'date'], how = 'left',
                            indicator = True).query('_merge == "left_only"').drop(['_merge'], axis = 1)
    sql_ready_df = sql_ready_df.drop(['lake_name_y', 'water_level_y'], axis=1)
    sql_ready_df = sql_ready_df.rename(columns={'lake_name_x': 'lake_name', 'water_level_x': 'water_level'})
    sql_ready_df = sql_ready_df[~sql_ready_df[['id_No', 'date']].apply(frozenset, axis = 1).duplicated()]
    sql_ready_df['date'] = sql_ready_df['date'].dt.strftime('%Y-%m-%d')

    sql_ready_df.to_sql('lake_water_level',
                        con=sql_engine,
                        index=False,
                        if_exists='append',
                        chunksize=2000
                        )
    print("GREALM-USDA Lake Levels Updated")
    sql_engine.close()
def update_hydroweb_lake_levels(data_table):
    """
    Update Lake Water Levels from the [HydroWeb Database](http://hydroweb.theia-land.fr/)

    :return: None
    """
    import pandas as pd
    from sqlalchemy import create_engine
    from io import BytesIO
    from zipfile import ZipFile
    import urllib.request
    from utiils import printProgressBar
    import config

    # %% Section: MetaInfo
    __author__ = 'John Franey'
    __credits__ = ['John Franey', 'Jake Gearon']

    __version__ = '1.0.0'
    __maintainer__ = 'John Franey'
    __email__ = '*****@*****.**'
    __status__ = 'Development'
    # %%
    username = config.username
    password = config.password
    sql_engine = create_engine(
        'mysql+pymysql://' + username + ':' + password +
        '@lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com:3306/laketest'
    ).connect()
    hydroweb_sql = u"SELECT `id_No`," \
                   u"`lake_name` " \
                   u" FROM reference_ID WHERE `source` = 'hydroweb'"

    # Read in HydroWeb lake IDs and names from the reference table
    hydroweb_lakes_info = pd.read_sql(hydroweb_sql, con=sql_engine)
    # %% Section: Create df of all available hydroweb data
    ls_df = []
    url = "http://hydroweb.theia-land.fr/hydroweb/authdownload?products=lakes&[email protected]&pwd=vHs98NdXe9BxWNyok*Pp&format=txt"
    target_url = urllib.request.urlopen(url)

    with ZipFile(BytesIO(target_url.read())) as my_zip_file:
        printProgressBar(0,
                         len(my_zip_file.namelist()),
                         prefix='HydroWeb Lake Data Update:',
                         suffix='Complete',
                         length=50)
        for count, contained_file in enumerate(my_zip_file.namelist(), 1):
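            # The zip entries appear to be named '<prefix>_<LAKE NAME>.txt'; the slice below
            # drops an 11-character prefix and the '.txt' suffix to recover the lake name
            # (an assumption based on the slice indices used here).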
            name = contained_file[11:-4]
            df = pd.read_csv(
                my_zip_file.open(contained_file),
                sep=';',
                comment='#',
                skiprows=1,
                index_col=False,
                parse_dates=[1],
                infer_datetime_format=True,
                names=[
                    'Decimal Year', 'date', 'Time | hh:mm', 'lake_level',
                    'Standard deviation from height (m)', 'Area (km2)',
                    'Volume with respect to volume of first date (km3)', 'Flag'
                ])
            df['lake_name'] = name.capitalize()
            # unique_id = hydroweb_lakes_info.loc[hydroweb_lakes_info['lake_name'] == name, 'id_No']
            ls_df.append(df)
            printProgressBar(count,
                             len(my_zip_file.namelist()),
                             prefix='HydroWeb Lake Data Update:',
                             suffix='Complete',
                             length=50)
    raw_lake_level_df = pd.concat(ls_df, ignore_index=True, copy=False)

    # %% Section: filter source df
    filtered_lake_levels = pd.DataFrame()
    filtered_lake_levels['date'] = raw_lake_level_df[
        raw_lake_level_df.columns[1]].dt.strftime('%Y-%m-%d')
    filtered_lake_levels['water_level'] = raw_lake_level_df[
        raw_lake_level_df.columns[3]]
    filtered_lake_levels['lake_name'] = raw_lake_level_df[
        raw_lake_level_df.columns[-1]]
    # %% Section
    id_labeled_df = pd.merge(hydroweb_lakes_info,
                             filtered_lake_levels,
                             on=['lake_name'])
    existing_database_df = data_table

    existing_database_df['date'] = pd.to_datetime(existing_database_df['date'])
    id_labeled_df['date'] = pd.to_datetime(id_labeled_df['date'])
    sql_ready_df = pd.merge(
        left=id_labeled_df,
        right=existing_database_df,
        on=['id_No', 'date'],
        how='left',
        indicator=True).query('_merge == "left_only"').drop(['_merge'], axis=1)
    sql_ready_df = sql_ready_df.drop(['lake_name_y', 'water_level_y'], axis=1)
    sql_ready_df = sql_ready_df.rename(columns={
        'lake_name_x': 'lake_name',
        'water_level_x': 'water_level'
    })
    sql_ready_df = sql_ready_df[~sql_ready_df[['id_No', 'date']].
                                apply(frozenset, axis=1).duplicated()]
    sql_ready_df['date'] = sql_ready_df['date'].dt.strftime('%Y-%m-%d')

    sql_ready_df.to_sql('lake_water_level',
                        con=sql_engine,
                        index=False,
                        if_exists='append',
                        chunksize=2000)
    print("HydroWeb Lake Levels Updated")
def update_usgs_lake_levels(data_table):
    """
    Writes USGS lake level data to the existing database, appending new records dynamically
    :return: None
    """
    from datetime import datetime
    import requests
    from requests.exceptions import HTTPError
    import pandas as pd
    from sqlalchemy import create_engine
    import pymysql
    from utiils import printProgressBar
    import config

    username = config.username
    password = config.password

    sql_engine = create_engine('mysql+pymysql://' + username + ':' + password +
                               '@lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com:3306/laketest').connect()
    connection = pymysql.connect(host='lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com',
                                 user=username,
                                 password=password,
                                 db='laketest',
                                 connect_timeout=100000,
                                 )
    cursor = connection.cursor()

    usgs_sql = u"SELECT `id_No`," \
               u"`lake_name` " \
               u" FROM reference_ID WHERE `source` = 'usgs'"

    begin_date = '1838-01-01'
    now = datetime.now()
    end_date = now.strftime('%Y-%m-%d')
    # todo for updating, only grab active sites!
    # todo use modified since for updating!
    # Read in usgs unique lake ID and observation dates from reference Table
    usgs_lakes_info = pd.read_sql(usgs_sql, con=sql_engine)
    # %%
    sites = get_usgs_sites()
    df_ls = []
    missing_sites = []
    printProgressBar(0, len(sites), prefix='USGS Lake Data Update:', suffix='Complete', length=50)
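    # The daily-values request below asks for several USGS statistic codes (statCd, e.g.
    # 00003 = daily mean, 00001 = daily maximum) and a list of lake/reservoir elevation
    # parameter codes (variable, e.g. 00062 and 62614-62618); each code is documented in
    # the USGS NWIS parameter and statistic code lists.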
    for count, site in enumerate(sites, 1):
        target_url = 'http://waterservices.usgs.gov/nwis/dv/?sites={}&siteType=LK&startDT={}&endDT={}' \
                     '&statCd=00003,00011,00001,32400,30800,30600&format=json&variable=00062,00065,' \
                     '30211,62600,62614,62615,62616,62617,62618,72020,' \
                     '72292,72293,72333,99020,72178,72199,99065,30207,' \
                     '72214,72264,72275,72335,72336'.format(site, begin_date,end_date).replace("%2C", ",")
        try:
            response = requests.get(target_url)
            response.raise_for_status()
            # Access JSON content
            jsonResponse = response.json()
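            # NWIS JSON nests observations under value -> timeSeries -> values -> value;
            # only the first time series and its first values block are used here.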
            site_name = jsonResponse["value"]["timeSeries"][0]['sourceInfo']['siteName']
            df = pd.DataFrame.from_dict(jsonResponse["value"]['timeSeries'][0]["values"][0]['value'],
                                        orient="columns").drop('qualifiers', axis=1)
            df["lake_name"] = site_name
            df_ls.append(df)

        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
        except IndexError as e:
            print(e)
            print('site data for {} not found, check parameters!'.format(site))
            missing_sites.append(site)
        except Exception as err:
            print(f'Other error occurred: {err}')

        printProgressBar(count, len(sites), prefix='USGS Lake Data Update:', suffix='Complete', length=50)

    # %%
    usgs_source_df = pd.concat(df_ls, ignore_index=True, copy=False)
    usgs_source_df["date"] = pd.to_datetime(usgs_source_df["dateTime"], format='%Y-%m-%d')
    id_labeled_df = pd.merge(usgs_lakes_info, usgs_source_df, on=['lake_name'])
    id_labeled_df['date'] = id_labeled_df['date'].dt.strftime('%Y-%m-%d')
    id_labeled_df = id_labeled_df.drop(['dateTime'], axis=1)
    id_labeled_df = id_labeled_df.rename(columns={'value': 'water_level'})

    existing_database_df = data_table

    existing_database_df['date'] = pd.to_datetime(existing_database_df['date'])
    id_labeled_df['date'] = pd.to_datetime(id_labeled_df['date'])

    # %%
    sql_ready_df = pd.merge(left = id_labeled_df, right = existing_database_df, on = ['id_No', 'date'], how = 'left',
                            indicator = True).query('_merge == "left_only"').drop(['_merge'], axis = 1)
    sql_ready_df = sql_ready_df.drop(['lake_name_y', 'water_level_y'], axis=1)
    sql_ready_df = sql_ready_df.rename(columns={'lake_name_x': 'lake_name', 'water_level_x': 'water_level'})
    sql_ready_df = sql_ready_df[~sql_ready_df[['id_No', 'date']].apply(frozenset, axis = 1).duplicated()]
    sql_ready_df['date'] = sql_ready_df['date'].dt.strftime('%Y-%m-%d')
    sql_ready_df.to_sql('lake_water_level',
                        con=sql_engine,
                        index=False,
                        if_exists='append',
                        chunksize=100000
                        )
    if not missing_sites:
        print('All sites processed')
    else:
        print('Missing {} out of {} sites.'.format(len(missing_sites), len(sites)))
        # print('Missing Site IDs:')
        # for site in missing_sites:
        #     print(site)
    print("USGS-NWIS Lake Levels Updated")
    connection.close()
def reference_table_metadata_json_replace():
    """
    Update lake metadata within the Reference ID table
    :param usgs_table: dataframe
    :return:
    """
    import pandas as pd
    from sqlalchemy import create_engine
    from lake_table_usgs import update_usgs_lake_names
    from utiils import printProgressBar
    import pymysql
    import json
    import config

    username = config.username
    password = config.password

    # Create database connection engines and cursor
    sql_engine = create_engine(
        'mysql+pymysql://' + username + ':' + password +
        '@lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com:3306/laketest'
    ).connect()
    connection = pymysql.connect(
        host='lake-test1.cevt7olsswvw.us-east-2.rds.amazonaws.com',
        user=username,
        password=password,
        db='laketest')
    cursor = connection.cursor()

    # Read in reference table for unique Lake ID and Lake name
    # id_table = lake_reference_df

    sql_command_metadata = u"ALTER TABLE reference_ID ADD COLUMN metadata JSON AFTER lake_name"
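    # Note: this ALTER TABLE assumes the metadata column does not exist yet; rerunning it
    # against a table that already has the column will raise a duplicate-column error.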

    cursor.execute(sql_command_metadata)
    id_table = pd.read_sql('select * from reference_ID', con=sql_engine)
    # Read in grealm summary table and clean dataframe
    grealm_url = 'https://ipad.fas.usda.gov/lakes/images/LakesReservoirsCSV.txt'
    grealm_source_df = pd.read_csv(grealm_url,
                                   skiprows=3,
                                   sep="\t",
                                   header=0,
                                   parse_dates=[-1],
                                   infer_datetime_format=True,
                                   error_bad_lines=False,
                                   skip_blank_lines=True)
    grealm_source_df = grealm_source_df[~grealm_source_df['Lake ID'].str.
                                        contains("Total")]
    grealm_source_df = grealm_source_df.rename(columns={
        'Name': 'lake_name',
        'Lake ID': 'grealm_database_ID'
    })

    # Rename lake name to <lake name>_<Resolution> if 2 or more versions of the lake exist
    grealm_source_df.loc[grealm_source_df.lake_name.duplicated(keep=False), 'lake_name'] =\
        grealm_source_df.loc[grealm_source_df.lake_name.duplicated(keep=False), 'lake_name'] + '_' +\
        grealm_source_df.loc[grealm_source_df.lake_name.duplicated(keep=False), 'Resolution'].astype(str)

    # Merge reference and grealm tables while keeping unique lake ID number from db, convert to json dict
    grealm_id_table = id_table[(id_table['source'] == 'grealm')]
    grealm_id_table = grealm_id_table.loc[grealm_id_table.index.difference(
        grealm_id_table.dropna().index)]
    grealm_id_table = grealm_id_table.drop(['metadata'], axis=1)
    df_grealm = grealm_id_table.merge(grealm_source_df,
                                      on='lake_name',
                                      how='inner')
    df_grealm = df_grealm.drop_duplicates(subset=['id_No'])
    df_grealm = df_grealm.set_index('id_No')
    df_grealm.dropna(axis=1, how='all', inplace=True)
    grealm_json = df_grealm.to_json(orient='index')
    grealm_json = grealm_json.replace('null', '"null"')
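    # eval() turns the JSON text into a Python dict; quoting the nulls above keeps the
    # literal evaluable for this data (json.loads would be the more conventional parser).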
    try:
        grealm_dict = eval(grealm_json)
    except NameError:
        grealm_dict = {}

    print('grealm metadata prepped')

    # repeat process with hydroweb summary table, results in json dict with Unique lake ID
    hydroweb_url = 'http://hydroweb.theia-land.fr/hydroweb/authdownload?list=lakes&format=txt'
    hydroweb_df = pd.read_csv(hydroweb_url)
    hydroweb_df = hydroweb_df.rename(columns={'lake': 'lake_name'})
    hydroweb_id_table = id_table.loc[id_table['source'] == 'hydroweb']
    hydroweb_id_table = hydroweb_id_table.loc[
        hydroweb_id_table.index.difference(hydroweb_id_table.dropna().index)]
    hydroweb_id_table = hydroweb_id_table.drop(['metadata'], axis=1)
    hydroweb_indexed_df = pd.merge(hydroweb_df,
                                   hydroweb_id_table,
                                   on='lake_name')
    hydroweb_indexed_df = hydroweb_indexed_df.set_index('id_No')
    hydroweb_json = hydroweb_indexed_df.to_json(orient='index')
    hydroweb_dict = eval(hydroweb_json)

    print('hydroweb metadata prepped')
    # USGS metadata requires use of functions from lake_table_usgs.py, but end result is json dict with unique lake ID
    #usgs_df = id_table.loc[id_table['source'] == 'usgs']
    #usgs_df = update_usgs_lake_names()
    usgs_df = pd.read_csv('usgs_test_from_id_table.csv')
    #usgs_df.to_csv('usgs_test_from_id_table.csv')
    usgs_df = usgs_df.rename(columns={'station_nm': 'lake_name'})
    usgs_id_table = id_table.loc[id_table['source'] == 'usgs']
    usgs_id_table = usgs_id_table.loc[usgs_id_table.index.difference(
        usgs_id_table.dropna().index)]
    usgs_df = pd.merge(usgs_df, usgs_id_table, on='lake_name')
    usgs_df = usgs_df.set_index('id_No')
    usgs_df = usgs_df.drop(['metadata'], axis=1)
    usgs_dict = usgs_df.to_json(orient='index')
    usgs_dict = usgs_dict.replace('true', '"true"')
    usgs_dict = usgs_dict.replace('false', '"false"')
    usgs_dict = usgs_dict.replace('null', '"null"')
    usgs_dict = eval(usgs_dict)
    print('USGS metadata prepped')

    # print(usgs_dict)
    print(grealm_dict)
    # print(hydroweb_dict)
    cursor = connection.cursor()
    # Execute mysql commands
    sql_command = u"UPDATE `reference_ID` SET `metadata` = (%s) WHERE `id_No` = (%s);"

    if len(grealm_dict.values()) > 0:
        printProgressBar(0,
                         len(grealm_dict.values()),
                         prefix='G-REALM:',
                         suffix='Complete',
                         length=50)
        for count, (key, value) in enumerate(grealm_dict.items(), 1):
            cursor.execute(sql_command, (json.dumps(value), key))
            printProgressBar(count,
                             len(grealm_dict.values()),
                             prefix='G-REALM:',
                             suffix='Complete',
                             length=50)
        connection.commit()

    if len(hydroweb_dict.values()) > 0:
        printProgressBar(0,
                         len(hydroweb_dict.values()),
                         prefix='HydroWeb:',
                         suffix='Complete',
                         length=50)
        for count, (key, value) in enumerate(hydroweb_dict.items(), 1):
            cursor.execute(sql_command, (json.dumps(value), key))
            printProgressBar(count,
                             len(hydroweb_dict.values()),
                             prefix='HydroWeb:',
                             suffix='Complete',
                             length=50)
        connection.commit()

    if len(usgs_dict.values()) > 0:
        printProgressBar(0,
                         len(usgs_dict.values()),
                         prefix='USGS-NWIS:',
                         suffix='Complete',
                         length=50)
        for count, (key, value) in enumerate(usgs_dict.items(), 1):
            cursor.execute(sql_command, (json.dumps(value), key))
            printProgressBar(count,
                             len(usgs_dict.values()),
                             prefix='USGS-NWIS:',
                             suffix='Complete',
                             length=50)
        connection.commit()

    connection.close()
    sql_engine.close()
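
# Hypothetical usage sketch (assumption: utiils.get_lake_table(), imported above, returns the
# existing lake_water_level table as a DataFrame; its exact signature is not shown in this file):
#
#     from utiils import get_lake_table
#     existing_levels = get_lake_table()
#     update_grealm_lake_levels(existing_levels)
#     update_hydroweb_lake_levels(existing_levels)
#     update_usgs_lake_levels(existing_levels)
#     reference_table_metadata_json_replace()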