def test_get_record_validation():
    """Verify that get_record rejects unknown and unimplemented services."""
    # An unknown service name must be reported explicitly.
    with pytest.raises(TypeError) as type_error:
        get_record(sites=['01491000'], service='not_a_service')
    assert str(type_error.value) == 'Unrecognized service: not_a_service'

    # 'stat' is a recognized but not-yet-implemented service.
    with pytest.raises(TypeError) as type_error:
        get_record(sites=['01491000'], service='stat')
    assert str(type_error.value) == 'stat service not yet implemented'
def update(self, service=None, approved=False):
    """Update the cached records for a service, or for all services.

    Parameters
    ----------
    service : string, optional
        Name of service to upgrade. If None, upgrade all existing
        services.
    approved : boolean
        If True, refetch starting at the OLDEST cached timestamp (so
        provisional values can be replaced); if False, refetch starting
        at the NEWEST cached timestamp.
        TODO: set default approval to True once implemented

    Raises
    ------
    TypeError
        If ``service`` is not one of the approved service names.
    """
    if not service:
        # Recurse once per existing service.
        for service in self.services():
            self.update(service=service, approved=approved)
    elif service not in self._approved_services:
        raise TypeError("Unrecognized service")
    elif service == 'site':
        # site has only one record, so simply update the entire table
        updated = nwis.get_record(self.id(), service=service)
        self.put(service, updated)
    else:
        site = self.id()
        old_df = self.get(service)
        # Choose the refetch start date from the cached index.
        if approved:
            last_time = old_df.iloc[0].name.strftime('%Y-%m-%d')
        else:
            last_time = old_df.iloc[-1].name.strftime('%Y-%m-%d')
        new_df = nwis.get_record(site, start=last_time, end=None,
                                 service=service)
        if new_df is not None:
            overlap = new_df.index.intersection(old_df.index)
            # BUG FIX: DataFrame.drop returns a new frame and was
            # previously discarded, so overlapping rows were duplicated
            # by the append below. Rebind the result.
            old_df = old_df.drop(overlap, axis=0)
            updated = old_df.append(new_df)
            self.put(service, updated)
def getstrm_wbs(station_id, end_date):
    """Download daily streamflow from https://waterdata.usgs.gov/nwis/.

    Streamflow (parameter 00060) is fetched from 1989-01-01 through
    ``end_date``.

    Parameters
    ----------
    station_id : list of str
        USGS station id numbers.
    end_date : str
        End date of the record, formatted yyyy-mm-dd.

    Returns
    -------
    pandas.DataFrame
        Streamflow values and dates, with columns ``flow``, ``code``,
        ``site_no`` plus a ``datetime`` column, indexed by date.
    """
    first_day = '1989-01-01'
    streamflow = nwis.get_record(sites=station_id, service='dv',
                                 start=first_day, end=end_date,
                                 parameterCd='00060')
    streamflow.columns = ['flow', 'code', 'site_no']
    streamflow = streamflow.rename_axis("datetime")
    # Keep a datetime column mirroring the index for convenience.
    streamflow['datetime'] = pd.to_datetime(streamflow.index)
    return streamflow
def _check_user_input(site, start_date, end_date, data_type, nc_output):
    """Validate user-supplied download arguments.

    Parameters
    ----------
    site : str
        USGS site number; validated by querying the NWIS 'site' service.
    start_date, end_date : str
        Dates in YYYY-MM-DD format; start must not be after end.
    data_type : str
        'dv' (daily value) or 'iv' (instantaneous value).
    nc_output : str or None
        Optional NetCDF output path; must end in '.nc' when given.

    Raises
    ------
    ValueError
        If any argument fails validation.
    """
    # check site -- any failure talking to NWIS is reported as a bad site
    try:
        site_info = nwis.get_record(sites=site, service='site')
        site_missing = site_info.empty
    except Exception:
        raise ValueError("Incorrect USGS site number.")
    if site_missing:
        raise ValueError("Incorrect USGS site number.")

    # check data_type
    if data_type not in ['dv', 'iv']:
        raise ValueError(
            "Incorrect data type: 'dv' as daily value or 'iv' as instantaneous value {} "
            .format(data_type))

    # check time
    # BUG FIX: the start/end comparison used to sit inside the same try
    # block whose `except Exception` re-raised a format error, so the
    # 'Incorrect start date.' message was always swallowed. Parse and
    # compare in separate steps.
    try:
        start_datetime = datetime.strptime(start_date, '%Y-%m-%d')
        end_datetime = datetime.strptime(end_date, '%Y-%m-%d')
    except (ValueError, TypeError):
        raise ValueError("Incorrect date format, should be YYYY-MM-DD")
    if start_datetime > end_datetime:
        raise ValueError('Incorrect start date.')

    # check nc_output
    if nc_output and nc_output[-3:] != '.nc':
        raise ValueError('Incorrect NetCDF file path.')
def test_iv_service():
    """Unit test of instantaneous value service"""
    sites = ['03339000', '05447500', '03346500']
    # START_DATE / END_DATE are module-level test constants.
    return get_record(sites, START_DATE, END_DATE, service='iv')
def test_measurements_service():
    """Test measurement service"""
    # Single site, two-day window.
    return get_record('03339000', '2018-01-24', '2018-01-25',
                      service='measurements')
def usgs_data(siteNumber, t1, t2, sel_int='HR', parameterCd='00060'):
    """Fetch a USGS record and resample it to an hourly mean series.

    Parameters
    ----------
    siteNumber : str
        USGS site number.
    t1, t2 : str
        Start and end of the requested period.
    sel_int : str
        'HR' selects the instantaneous-value service ('iv'); anything
        else selects daily values ('dv').
    parameterCd : str
        USGS parameter code (default '00060').

    Returns
    -------
    pandas.DataFrame
        Single column named "<station name> (USGS)", tz-naive index,
        resampled to 60-minute means.
    """
    service = 'iv' if sel_int == 'HR' else 'dv'
    data = nwis.get_record(sites=siteNumber, service=service,
                           start=t1, end=t2, parameterCd=parameterCd)
    site_info, md = nwis.get_info(sites=siteNumber)
    # BUG FIX: .copy() so renaming columns below acts on an independent
    # frame instead of a slice of `data` (SettingWithCopyWarning).
    df = data.iloc[:, 0:1].copy()
    # Station name up to the first comma, title-cased, as column label.
    cname = site_info['station_nm'].iloc[0].split(',')[0]
    df.columns = [cname.title() + ' (USGS)']
    df = df.tz_localize(None)
    df = df.resample('60min').mean()
    return df
def download(self, service, start=None, end=None):
    """Download records for *service* from NWIS and store them.

    Parameters
    ----------
    service : string
    start : string
    end : string
    """
    group = self._group(service)
    try:
        records = nwis.get_record(self.id(), start=start, end=end,
                                  service=service)
        self.put(service, records)
    except NoSitesError:
        # Best effort: report and move on when the site has no data.
        print('{} has no data on {}'.format(self.id(), service))
continue query = """SELECT datenew, year, month, day, site_number FROM nwis.groundwater_daily_site_2 WHERE site_number = '{}' ORDER BY year DESC, month DESC, day DESC LIMIT 1""".format( siteNumbers[i]) data = pd.read_sql_query(query, cnx) if (data.empty): continue lastdate = data.iloc[0][0] lastdateobj = datetime.datetime.strptime( lastdate, '%m/%d/%Y') + datetime.timedelta(days=1) lastdatefinal = lastdateobj.strftime('%m/%d/%Y') lastdatelist = lastdatefinal.split("/") lastdatelist = [lastdatelist[2], lastdatelist[0], lastdatelist[1]] lastdatestr = "-".join(lastdatelist) df = nwis.get_record(sites=siteNumbers[i], service='dv', start=lastdatestr, end=date.today()) if (df.empty): continue a_list = df.index.tolist() if (len(a_list) > 0): for i in range(len(a_list)): a_list[i] = str(a_list[i]).replace(" 00:00:00+00:00", "") df.index = a_list newDF = newDF.append(df, ignore_index=False) newDF["date"] = newDF.index newDF[['year', 'month', 'day']] = newDF.date.str.split("-", expand=True) newDF["datecloser"] = newDF['month'].str.cat(newDF['day'], sep="/") newDF["datenew"] = newDF['datecloser'].str.cat(newDF['year'], sep="/") finalDF = newDF[[ "site_no", "72019_Mean", "72019_Mean_cd", "72019_Maximum",
return reg_model_predictions # %% # Step 3: Read in USGS streamflow data and create dataframe of avg weekly flow # Used nwis.get_record function instead of saving a local file # Change stop_date each week station_id = "09506000" USGS_start = "1989-01-01" USGS_stop = "2020-10-24" data_flow = nwis.get_record(sites=station_id, service='dv', start=USGS_start, end=USGS_stop, parameterCd='00060') # Rename columns data_flow.columns = ['flow', 'code', 'site_no'] # Make index a recognized datetime format instead of string data_flow.index = data_flow.index.strftime('%Y-%m-%d') data_flow['datetime'] = pd.to_datetime(data_flow.index) data_flow['year'] = pd.DatetimeIndex(data_flow['datetime']).year data_flow['month'] = pd.DatetimeIndex(data_flow['datetime']).month data_flow['day'] = pd.DatetimeIndex(data_flow['datetime']).day data_flow['dayofweek'] = pd.DatetimeIndex(data_flow['datetime']).dayofweek
ax.plot(prc_mean.index, prc_mean["Precipitation"], color='red',
        linestyle='--', label='Observed')
ax.set(title=" Precipitation since 2000", xlabel="Date",
       ylabel="Precipitation [mm/week]", yscale='log')
ax.legend()
fig.savefig("Historical_precip.png")  # Save figure

# %% Streamflow section
# Pull daily streamflow (parameter 00060) directly from NWIS
station_id = '09506000'
start_date = '1989-01-01'
end_date = '2020-10-31'
data_flow_day = nwis.get_record(sites=station_id, service='dv',
                                start=start_date, end=end_date,
                                parameterCd='00060')
data_flow_day.columns = ['flow', 'code', 'site_no']  # Rename columns
# Make index a recognized datetime format instead of string
data_flow_day.index = data_flow_day.index.strftime('%Y-%m-%d')

# %% Expand dates to year month day
data_flow_day['datetime'] = pd.to_datetime(data_flow_day.index)
data_flow_day['year'] = pd.DatetimeIndex(data_flow_day['datetime']).year
data_flow_day['month'] = pd.DatetimeIndex(data_flow_day['datetime']).month
# BUG FIX: 'day' was being assigned .dayofweek; use .day for day-of-month.
data_flow_day['day'] = pd.DatetimeIndex(data_flow_day['datetime']).day
data_flow_day['dayofweek'] = pd.DatetimeIndex(data_flow_day['datetime']).dayofweek

# %% AR model that you ended up building
def _get_nwis_data(site, start_date, end_date, data_type, nc_output):
    """Download NWIS time series for *site* and package it as an xarray dataset.

    Parameters
    ----------
    site : str
        USGS site number.
    start_date, end_date : str
        Requested date range passed to ``nwis.get_record``.
    data_type : str
        'dv' for daily values or 'iv' for instantaneous values.
    nc_output : str or None
        When given, path of a NetCDF file to write the dataset to.

    Returns
    -------
    xarray.Dataset
        One variable per available parameter, with site metadata stored
        as dataset attributes.

    Raises
    ------
    ValueError
        If none of the known discharge/sediment variables are available
        for the site.
    """
    # USGS parameter code -> [name, unit]
    variable_info = {
        '00060': ['discharge', 'cubic feet per second'],
        '00065': ['gage height', 'feet'],
        '00010': ['water temperature', 'degree celsius'],
        '80154': ['Suspended sediment discharge', 'tons per day'],
        '80155': ['Total sediment discharge', 'tons per day'],
        '80225': ['Bedload sediment discharge', 'tons per day']
    }

    # get site info
    site_info = nwis.get_record(sites=site, service='site')

    # get time series data frame
    record_df = nwis.get_record(sites=site, service=data_type,
                                start=start_date, end=end_date)

    # Daily-value columns carry a '_Mean' suffix; accept both forms.
    filter_names = list(variable_info.keys()) + [
        var_name + '_Mean' for var_name in variable_info.keys()
    ]
    var_col_names = [
        col_name for col_name in record_df.columns
        if col_name in filter_names
    ]
    if record_df.empty or not var_col_names:
        raise ValueError(
            'Time series for discharge variables is not available for site {}.'
            .format(site))

    # .copy() so the column rename below works on an independent frame
    # (avoids SettingWithCopyWarning on the record_df slice).
    time_series_df = record_df[var_col_names].copy()
    # Strip any '_Mean' suffix back down to the 5-digit parameter code.
    time_series_df.columns = [
        col_name[:5] for col_name in time_series_df.columns
    ]

    # create xarray dataset
    xr_dataset = time_series_df.to_xarray()
    # assign datetime data to coordinate
    xr_dataset['datetime'] = time_series_df.index.values

    # add site metadata
    xr_dataset.attrs['site_name'] = site_info.station_nm[0]
    xr_dataset.attrs['site_code'] = site_info.site_no[0]
    xr_dataset.attrs['site_latitude'] = site_info.dec_lat_va[0]
    xr_dataset.attrs['site_longitude'] = site_info.dec_long_va[0]
    xr_dataset.attrs['site_altitude'] = site_info.alt_va[0]
    xr_dataset.attrs['site_coord_datum'] = site_info.dec_coord_datum_cd[0]

    # add variable metadata
    for var_name in time_series_df.columns:
        xr_dataset[var_name].attrs['variable_name'] = variable_info[var_name][0]
        xr_dataset[var_name].attrs['variable_unit'] = variable_info[var_name][1]
        # BUG FIX: the original expression was
        #   data_type if data_type == 'dv' 'daily value' else ...
        # where 'dv' 'daily value' implicitly concatenates to
        # 'dvdaily value', so the condition was always False and the
        # 'daily value' label was never applied.
        xr_dataset[var_name].attrs['variable_data_type'] = (
            'daily value' if data_type == 'dv' else 'instantaneous value')

    # optionally save the dataset as a NetCDF file (best effort)
    if nc_output:
        try:
            xr_dataset.to_netcdf(nc_output)
        except Exception:
            print('Failed to write the data in the NetCDF file.')

    return xr_dataset
url = ("https://waterdata.usgs.gov/nwis/dv?cb_00060=on&format=rdb&site_no=09506000"
       "&referred_module=sw&period=&begin_date=1989-01-01&end_date=2020-10-19")

# Replace parts of my url with variables
site = '09506000'
start = '1990-01-01'
end = '2020-10-16'
url = ("https://waterdata.usgs.gov/nwis/dv?cb_00060=on&format=rdb&site_no="
       f"{site}&referred_module=sw&period=&begin_date={start}&end_date={end}")
data2 = pd.read_table(url, skiprows=30,
                      names=['agency_cd', 'site_no', 'datetime',
                             'flow', 'code'],
                      parse_dates=['datetime'], index_col='datetime')

# %%
obs_day = nwis.get_record(sites=site, service='dv', start=start, end=end,
                          parameterCd='00060')
obs_week = np.mean(obs_day['00060_Mean'])

# we can look at the package directly
# 1) Type conda env list in terminal to see where you environment lives
# 2) Navigate to that directory
# 3) From that directory go to /libs/pythonxx/site_packages
# Open up that folder to see all the packages you have!

# %%
# Option 3: We can generate this URL and get the data using an API
# Technically we were already doing this you just didn't know it
# API = Application Programming Interface (Translation - a standard set of appraches/protocols
# for working with a given dataset in a predictable way --- rules for accessing data)
# Different datasets have their own APIs
# %%
# read in the forecast data and setup a dataframe
# filepath = os.path.join('..', 'Seasonal_Foercast_Dates.csv')
filepath = os.path.join('../weekly_results', 'weekly_observations.csv')
print(filepath)
obs_table = pd.read_csv(filepath, index_col='forecast_week')

# %%
# Read in the observations and get weekly averages
for wk in range(1, week + 1):
    print(wk)
    wk_start = obs_table.loc[wk, 'start_date']
    wk_end = obs_table.loc[wk, 'end_date']
    # Pull daily streamflow from USGS and average it over the week
    obs_day = nwis.get_record(sites=station_id, service='dv',
                              start=wk_start, end=wk_end,
                              parameterCd='00060')
    obs_table.loc[wk, 'observed'] = np.round(np.mean(obs_day['00060_Mean']), 3)

# %%
# Write the updated observations out
filepath_out = os.path.join('..', 'weekly_results', 'weekly_observations.csv')
obs_table.to_csv(filepath_out, index_label='forecast_week')

# %%
#--- https://github.com/USGS-python/dataretrieval
#--- https://stackoverflow.com/questions/16176996/keep-only-date-part-when-using-pandas-to-datetime
#--- https://stackoverflow.com/questions/50890989/pandas-changing-the-format-of-nan-values-when-saving-to-csv

#--- first import the functions for downloading data from NWIS
import dataretrieval.nwis as nwis

#--- specify the USGS site code for which we want data.
# BUG FIX: the file handle from open(...).read() was never closed;
# use a context manager so it is released deterministically.
with open("USGS - StationIDs.txt") as station_file:
    stationList = station_file.read().splitlines()

#--- specify the USGS parameter code for which we want data.
parameterList = ['00020', '00021', '00025', '00030', '00032', '00035',
                 '00036', '00045', '00046', '00052', '46516', '46529',
                 '72192', '72194', '99772', '45587', '45588', '45589',
                 '45590', ]

# get basic info about the site
# df = nwis.get_record(sites=stationList, service='site')
# df.to_csv(r'C:\Users\Roberto\Documents\Climatología\USGS\export_dataframe_sites_info.csv', header=True)
# print(df)

# Daily values for December 2019 for every station in the list.
df1 = nwis.get_record(stationList, service='dv',
                      start='2019-12-01', end='2019-12-31')

#--- Use this if table data import wizard from MySQL Workbench will be used
df1.to_csv(r'C:\Users\Roberto\Documents\Climatología\USGS\export_dataframe.csv',
           date_format='%Y-%m-%d', header=True, na_rep='NULL')
#--- Use this one if LOAD DATA INFILE will be used in MySQL
# df1.to_csv(r'C:\Users\Roberto\Documents\Climatología\USGS\export_dataframe.csv', date_format='%Y-%m-%d', header=True, na_rep='\\N')
print(df1)