def funcP(axP, iP, iM):
    """Fill the pop-up panel for one site/variable pair in the ts-map browser.

    axP : array of 5 axes (2 maps, 2 time series, 1 C-Q scatter).
    iP  : index of the selected site within the module-level siteNoLst.
    iM  : index of the selected variable (maps into codeLst2 and the metric
          matrices xMat/yMat/cMat).

    Relies on module-level state: xMat, yMat, cMat, cR, siteNoLst, codeLst2,
    dictObs, dictLSTM, dictWRTDS, ind1, ind2 — assumed to be defined by the
    surrounding script (TODO confirm).
    """
    # per-site metric difference; presumably Rsq(LSTM) - Rsq(WRTDS) since the
    # inputs are squared correlation-like matrices — verify against caller
    rr = xMat[:, iM]**2-yMat[:, iM]**2
    # cMat may hold one column per variable or a single shared column
    cc = cMat[:, iM] if cMat.ndim == 2 else cMat
    dfCrd = gageII.readData(
        varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
    lat = dfCrd['LAT_GAGE'].values
    lon = dfCrd['LNG_GAGE'].values
    # maps; a 2-degree circle highlights the selected site on each map
    axplot.mapPoint(axP[0], lat, lon, rr, vRange=[-0.3, 0.3], s=16, cb=False)
    circle = plt.Circle([lon[iP], lat[iP]], 2, color='black', fill=False)
    axP[0].add_patch(circle)
    axplot.mapPoint(axP[1], lat, lon, cc, vRange=cR, s=16, cb=False)
    circle = plt.Circle([lon[iP], lat[iP]], 2, color='black', fill=False)
    axP[1].add_patch(circle)
    siteNo = siteNoLst[iP]
    # ts
    code = codeLst2[iM]
    print(code, siteNo)
    print(iP, iM)
    # observed vs the two model series for the selected code
    v0 = dictObs[siteNo][code].values
    v1 = dictLSTM[siteNo][code].values
    v2 = dictWRTDS[siteNo][code].values
    t = dictObs[siteNo].index.values
    legLst = ['LSTM', 'WRTDS', 'Obs']
    # ind1 / ind2 are two time windows (presumably train/test split) —
    # TODO confirm against the script that defines them
    axplot.plotTS(axP[2], t[ind1], [v1[ind1], v2[ind1], v0[ind1]],
                  styLst='--*', cLst='rbk', legLst=legLst)
    axplot.plotTS(axP[3], t[ind2], [v1[ind2], v2[ind2], v0[ind2]],
                  styLst='--*', cLst='rbk', legLst=legLst)
    # cq: concentration vs log-streamflow, colored by day of year
    q = dictObs[siteNo]['00060'].values
    c = dictObs[siteNo][code].values
    td = dictObs[siteNo].index.dayofyear
    sc = axP[4].scatter(np.log(q), c, c=td, cmap='hsv', vmin=0, vmax=365)
def readSiteY(siteNo, varY, area=None, sd=np.datetime64('1979-01-01'),
              ed=np.datetime64('2020-01-01')):
    """Read target time series for one USGS site on a fixed daily index.

    varY may mix water-quality codes, '00060' (streamflow) and 'runoff'.
    Sample flags are read alongside the samples; only the varY columns are
    returned.  When 'runoff' is requested and area is None, the drainage
    area is looked up from gageII.
    """
    dates = pd.date_range(sd, ed)
    out = pd.DataFrame({'date': dates}).set_index('date')
    # extract data
    sampleCodes = [code for code in varY if code in usgs.codeLst]
    dfC, dfCF = usgs.readSample(siteNo, codeLst=sampleCodes, startDate=sd,
                                flag=True)
    if '00060' in varY or 'runoff' in varY:
        flow = usgs.readStreamflow(siteNo, startDate=sd).rename(
            columns={'00060_00003': '00060'})
        if 'runoff' in varY:
            if area is None:
                tab = gageII.readData(varLst=['DRAIN_SQKM'],
                                      siteNoLst=[siteNo])
                area = tab['DRAIN_SQKM'].values[0]
            flow['runoff'] = calRunoffArea(flow['00060'], area)
        out = out.join(flow)
    out = out.join(dfC).join(dfCF)
    return out[varY]
def getGeo(self, subsetName=None):
    """Return (lat, lon) arrays for the sites of a subset (all when None)."""
    sites = self.getSite(subsetName=subsetName)
    crd = gageII.readData(varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=sites)
    return crd['LAT_GAGE'].values, crd['LNG_GAGE'].values
def plotP(xx, yy, cc, iP, code):
    """Draw the pop-up figure for one site: three maps, two time-series
    panels and a C-Q scatter, with a summary suptitle.

    xx, yy : per-site metrics mapped on panels 0/1 (also shown in the title
             as corrLSTM / corrWRTDS)
    cc     : per-site attribute mapped on panel 2 with color range cR
    iP     : index of the highlighted site within siteNoLst
    code   : USGS water-quality code shown in the ts / C-Q panels

    Relies on module-level state: siteNoLst, axP, figP, cR, cVar, dictObs,
    dictLSTM, dictWRTDS, ind1, ind2 — assumed defined by the surrounding
    script (TODO confirm).
    """
    dfCrd = gageII.readData(
        varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
    lat = dfCrd['LAT_GAGE'].values
    lon = dfCrd['LNG_GAGE'].values
    # maps; a 2-degree circle marks the selected site on each map
    axplot.mapPoint(axP[0], lat, lon, xx, vRange=[-0.5, 1], s=16)
    circle = plt.Circle([lon[iP], lat[iP]], 2, color='black', fill=False)
    axP[0].add_patch(circle)
    axplot.mapPoint(axP[1], lat, lon, yy, vRange=[-0.5, 1], s=16)
    circle = plt.Circle([lon[iP], lat[iP]], 2, color='black', fill=False)
    axP[1].add_patch(circle)
    axplot.mapPoint(axP[2], lat, lon, cc, vRange=cR, s=16)
    circle = plt.Circle([lon[iP], lat[iP]], 2, color='black', fill=False)
    axP[2].add_patch(circle)
    siteNo = siteNoLst[iP]
    # ts: obs vs both models over the two windows ind1 / ind2
    v0 = dictObs[siteNo][code].values
    v1 = dictLSTM[siteNo][code].values
    v2 = dictWRTDS[siteNo][code].values
    t = dictObs[siteNo].index.values
    legLst = ['LSTM', 'WRTDS', 'Obs']
    axplot.plotTS(axP[3], t[ind1], [v1[ind1], v2[ind1], v0[ind1]],
                  styLst='--*', cLst='rbk', legLst=legLst)
    axplot.plotTS(axP[4], t[ind2], [v1[ind2], v2[ind2], v0[ind2]],
                  styLst='--*', cLst='rbk', legLst=legLst)
    # cq: concentration vs log-streamflow, colored by day of year
    q = dictObs[siteNo]['00060'].values
    c = dictObs[siteNo][code].values
    td = dictObs[siteNo].index.dayofyear
    sc = axP[5].scatter(np.log(q), c, c=td, cmap='hsv', vmin=0, vmax=365)
    # figP.colorbar(sc, ax=axP[5])
    figP.suptitle(
        'code {} {}; siteNo {} \n corrLSTM {:.2f}; corrWRTDS {:.2f}; {} {}'.format(
            code, usgs.codePdf.loc[code]['shortName'], siteNo, xx[iP], yy[iP],
            cVar, cc[iP]))
    figP.show()
def calRunoff(q, info):
    """Convert streamflow to area-normalized runoff for a batch of samples.

    q    : streamflow array with one entry (column) per sample in info
    info : DataFrame with a 'siteNo' column aligned with q's samples
    Returns the result of calRunoffArea(q, area) with per-sample drainage
    areas looked up from gageII.
    """
    siteNoLst = info.siteNo.unique().tolist()
    dfArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=siteNoLst)
    # FIX: the original called dfArea.rename({'STAID': 'siteNo'}) and
    # discarded the result (rename is not in-place), so the call was a no-op.
    # The intent was to label the site-number index; rename_axis does that
    # without touching row labels.  join(on='siteNo') matches info['siteNo']
    # against dfArea's index either way, so behavior is unchanged.
    dfArea = dfArea.rename_axis('siteNo')
    area = info.join(dfArea, on='siteNo')['DRAIN_SQKM'].values
    runoff = calRunoffArea(q, area)
    return runoff
def wrapData(caseName, siteNoLst, nFill=5, freq='D', sdStr='1979-01-01',
             edStr='2019-12-31'):
    """Assemble and save model input tensors for a list of USGS sites.

    Builds forcing (f), streamflow (q), water-quality (c) and geo-constant
    (g) arrays on a common date range and writes them, plus a JSON info file,
    to caseFolder(caseName).  Array layout (from the stack/swapaxes below):
    f/q/c = [time, site, var]; g = [site, var].

    caseName  : name of the data case; output goes to caseFolder(caseName)
    siteNoLst : list of USGS site numbers
    nFill     : max length of NaN gap to interpolate in the forcings
    freq      : sampling frequency forwarded to readSiteTS
    """
    varF = gridMET.varLst
    varQ = usgs.varQ
    varG = gageII.lstWaterQuality
    varC = usgs.newC
    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    tR = pd.date_range(np.datetime64(sdStr), np.datetime64(edStr))
    fLst, qLst, gLst, cLst = [list() for x in range(4)]
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        varLst = varQ + varF + varC
        df = readSiteTS(siteNo, varLst=varLst, freq=freq)
        # streamflow
        tempQ = pd.DataFrame({'date': tR}).set_index('date').join(df[varQ])
        qLst.append(tempQ.values)
        # forcings: fill gaps up to nFill steps, interior of the record only
        tempF = pd.DataFrame({'date': tR}).set_index('date').join(df[varF])
        tempF = tempF.interpolate(limit=nFill, limit_direction='both',
                                  limit_area='inside')
        fLst.append(tempF.values)
        # # water quality — left sparse (no interpolation)
        tempC = pd.DataFrame({'date': tR}).set_index('date').join(df[varC])
        cLst.append(tempC.values)
        # geog
        gLst.append(tabG.loc[siteNo].values)
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    # save
    saveFolder = caseFolder(caseName)
    if not os.path.exists(saveFolder):
        os.mkdir(saveFolder)
    np.savez_compressed(os.path.join(saveFolder, 'data'), c=c, q=q, f=f, g=g)
    dictData = dict(name=caseName, varG=varG, varQ=varQ, varF=varF, varC=varC,
                    sd=sdStr, ed=edStr, freq=freq, siteNoLst=siteNoLst)
    with open(os.path.join(saveFolder, 'info') + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
def wrapData(caseName, siteNoLst, nFill=5, freq='D', sdStr='1979-01-01',
             edStr='2019-12-31',
             varF=gridMET.varLst + ntn.varLst + GLASS.varLst,
             varQ=usgs.varQ, varG=gageII.varLst, varC=usgs.newC):
    """Assemble and save model input tensors for a list of USGS sites.

    Variant of wrapData with the variable lists exposed as parameters;
    forcings default to gridMET + NTN + GLASS.  Array layout (from the
    stack/swapaxes below): f/q/c = [time, site, var]; g = [site, var].
    Results are persisted through saveDataFrame.

    NOTE(review): the list-valued defaults are built once at import time;
    they are only read here, but mutating them at a call site would leak
    into later calls.
    """
    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    tR = pd.date_range(np.datetime64(sdStr), np.datetime64(edStr))
    fLst, qLst, gLst, cLst = [list() for x in range(4)]
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        varLst = varQ + varF + varC
        df = readSiteTS(siteNo, varLst=varLst, freq=freq)
        # streamflow
        tempQ = pd.DataFrame({'date': tR}).set_index('date').join(df[varQ])
        qLst.append(tempQ.values)
        # forcings: fill gaps up to nFill steps, interior of the record only
        tempF = pd.DataFrame({'date': tR}).set_index('date').join(df[varF])
        tempF = tempF.interpolate(limit=nFill, limit_direction='both',
                                  limit_area='inside')
        fLst.append(tempF.values)
        # # water quality — left sparse (no interpolation)
        tempC = pd.DataFrame({'date': tR}).set_index('date').join(df[varC])
        cLst.append(tempC.values)
        # geog
        gLst.append(tabG.loc[siteNo].values)
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    # save
    saveDataFrame(caseName, c=c, q=q, f=f, g=g, varC=varC, varQ=varQ,
                  varF=varF, varG=varG, sdStr=sdStr, edStr=edStr, freq=freq,
                  siteNoLst=siteNoLst)
def readSiteTS(siteNo, varLst, freq='D', area=None,
               sd=np.datetime64('1979-01-01'),
               ed=np.datetime64('2019-12-31'), rmFlag=True):
    """Read and merge all requested time series for one site.

    varLst may mix water-quality codes (usgs.varC), streamflow variables
    ('00060' / 'runoff'), gridMET forcings, NTN variables, GLASS variables
    and derived time variables (varTLst).  Returns a DataFrame on a daily
    index with columns in varLst order; freq='W' resamples to weekly
    (Tuesday-ending) means.

    area  : drainage area for runoff; looked up from gageII when None
    rmFlag: when True, flagged water-quality samples are removed
    """
    # read data
    td = pd.date_range(sd, ed)
    # split the request per data source (set intersection drops ordering;
    # order is restored by the final dfD[varLst] selection)
    varC = list(set(varLst).intersection(usgs.varC))
    varQ = list(set(varLst).intersection(usgs.varQ))
    varF = list(set(varLst).intersection(gridMET.varLst))
    varP = list(set(varLst).intersection(ntn.varLst))
    varR = list(set(varLst).intersection(GLASS.varLst))
    varT = list(set(varLst).intersection(varTLst))
    dfD = pd.DataFrame({'date': td}).set_index('date')
    if len(varC) > 0:
        if rmFlag:
            dfC, dfCF = usgs.readSample(siteNo, codeLst=varC, startDate=sd,
                                        flag=2)
            dfC = usgs.removeFlag(dfC, dfCF)
        else:
            dfC = usgs.readSample(siteNo, codeLst=varC, startDate=sd)
        dfD = dfD.join(dfC)
    if len(varQ) > 0:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varLst:
            if area is None:
                tabArea = gageII.readData(varLst=['DRAIN_SQKM'],
                                          siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        dfD = dfD.join(dfQ)
    if len(varF) > 0:
        dfF = gridMET.readBasin(siteNo, varLst=varF)
        dfD = dfD.join(dfF)
    if len(varP) > 0:
        dfP = ntn.readBasin(siteNo, varLst=varP, freq='D')
        dfD = dfD.join(dfP)
    if len(varR) > 0:
        dfR = GLASS.readBasin(siteNo, varLst=varR, freq='D')
        dfD = dfD.join(dfR)
    if len(varT) > 0:
        # derived temporal variables computed from the date index via calT
        t = dfD.index.values
        matT, _ = calT(t)
        dfT = pd.DataFrame(index=t, columns=varTLst, data=matT)
        dfD = dfD.join(dfT[varT])
    dfD = dfD[varLst]
    if freq == 'D':
        return dfD
    elif freq == 'W':
        # weekly means, weeks ending on Tuesday
        dfW = dfD.resample('W-TUE').mean()
        return dfW
    # NOTE(review): any other freq silently returns None — consider raising.
def funcM():
    """Build the map figure (Rsq difference per site) and an empty 3-panel
    pop-up figure for the interactive ts-map browser.

    Uses module-level siteNoLst, xMat, yMat.  Returns
    (figM, axM, figP, axP, lon, lat) as expected by the ts-map driver.
    """
    dfCrd = gageII.readData(varLst=['LAT_GAGE', 'LNG_GAGE'],
                            siteNoLst=siteNoLst)
    lat = dfCrd['LAT_GAGE'].values
    lon = dfCrd['LNG_GAGE'].values
    # push sites with missing metrics far off the map extent; assumes xMat
    # is one value per site here — TODO confirm (elsewhere xMat is 2-d)
    lat[np.isnan(xMat)] = 9999
    lon[np.isnan(xMat)] = 9999
    figM, axM = plt.subplots(1, 1, figsize=(12, 4))
    axplot.mapPoint(axM, lat, lon, xMat**2 - yMat**2,
                    vRange=[-0.3, 0.3], s=16)
    axM.set_title('testing Rsq LSTM - Rsq WRTDS')
    # pop-up: two wide ts panels stacked left, one tall panel on the right
    figP = plt.figure(figsize=[16, 6])
    axP = list()
    gsP = gridspec.GridSpec(2, 3)
    axP.append(figP.add_subplot(gsP[0, :2]))
    axP.append(figP.add_subplot(gsP[1, :2]))
    axP.append(figP.add_subplot(gsP[0:, 2]))
    axP = np.array(axP)
    return figM, axM, figP, axP, lon, lat
def readSiteX(siteNo, varX, area=None, nFill=5,
              sd=np.datetime64('1979-01-01'),
              ed=np.datetime64('2020-01-01')):
    """Read model input time series for one site on a fixed daily index.

    Joins gridMET forcings (and streamflow/runoff when requested) onto the
    [sd, ed] date range, keeps the varX columns and gap-fills runs of up to
    nFill consecutive missing values.  When 'runoff' is requested and area
    is None, the drainage area is looked up from gageII.
    """
    dates = pd.date_range(sd, ed)
    out = pd.DataFrame({'date': dates}).set_index('date')
    # extract data
    forcing = gridMET.readBasin(siteNo)
    if '00060' in varX or 'runoff' in varX:
        flow = usgs.readStreamflow(siteNo, startDate=sd).rename(
            columns={'00060_00003': '00060'})
        if 'runoff' in varX:
            if area is None:
                tab = gageII.readData(varLst=['DRAIN_SQKM'],
                                      siteNoLst=[siteNo])
                area = tab['DRAIN_SQKM'].values[0]
            flow['runoff'] = calRunoffArea(flow['00060'], area)
        out = out.join(flow)
    out = out.join(forcing)[varX]
    return out.interpolate(limit=nFill, limit_direction='both')
# Box plot of the error metrics, then compare HBN sites against the full
# site list on drainage area vs error.
fig = figplot.boxPlot(dataBox, label1=labLst1, label2=labLst2)
fig.suptitle(title)
fig.show()
fig.savefig(os.path.join(figFolder, figName))

siteNoLst = wqData.info['siteNo'].unique().tolist()
# HBN inventory — presumably Hydrologic Benchmark Network sites; verify
# against the CSV source
dfHBN = pd.read_csv(
    os.path.join(kPath.dirData, 'USGS', 'inventory', 'HBN.csv'),
    dtype={'siteNo': str}).set_index('siteNo')
siteNoHBN = [siteNo for siteNo in dfHBN.index.tolist()
             if siteNo in siteNoLst]
# free-text columns are not usable as numeric attributes
dropColLst = [
    'STANAME', 'WR_REPORT_REMARKS', 'ADR_CITATION', 'SCREENING_COMMENTS'
]
dfX = gageII.readData(siteNoLst=siteNoLst).drop(columns=dropColLst)
dfX = gageII.updateCode(dfX)
# flow-per-area unit conversion (0.3048^3: ft^3 -> m^3; seconds/year;
# 1000^2: km^2 scaling) — presumably cfs/km^2 -> mm/yr; confirm
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2
# area vs error
indHBN = [siteNoLst.index(siteNo) for siteNo in siteNoHBN]
area = dfX['DRAIN_SQKM'].values
errMat = errMatLst2[0]
code = '00605'
# code = '00955'
err = errMat[:, wqData.varC.index(code), 1]
fig, ax = plt.subplots(1, 1)
# all sites in blue, HBN subset highlighted in red
ax.plot(area, err, 'b*')
ax.plot(area[indHBN], err[indHBN], 'r*')
# np.nanmedian(err)
# np.nanmedian(err[indHBN, :])
def wrapData(caseName, siteNoLst, rho=365, nFill=5, varC=usgs.varC,
             varG=gageII.lstWaterQuality):
    """wrap up input and target data for the model, as:
    x=[nT,nP,nX]
    y=[nP,nY]
    c=[nP,nC]
    where nP is number of time series

    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS site

    Keyword Arguments:
        rho {int} -- length of the antecedent window in days (default: {365})
        nFill {int} -- max number of continous nan to interpolate in input
            data (default: {5})
        varC {list} -- list of water quality code to learn
            (default: {usgs.lstCodeSample})
        varG {list} -- list of constant variables in gageII
            (default: {gageII.lstWaterQuality})
        varQ and varF are fixed so far
    """
    # add a start/end date to improve efficiency.
    # FIX: pd.datetime was deprecated and removed in pandas >= 2.0;
    # pd.Timestamp is the drop-in replacement and compares identically
    # against DatetimeIndex entries.
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    # read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
    fLst = list()  # forcing ts
    gLst = list()  # geo-const
    qLst = list()  # streamflow
    cLst = list()  # water quality
    cfLst = list()  # water quality flags
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        dfC, dfCF = usgs.readSample(siteNo, codeLst=varC,
                                    startDate=startDate, flag=2)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        dfF = gridMET.readBasin(siteNo)
        # one sample per water-quality observation: the rho-day window of
        # Q and F ending on the sample date
        for k in range(len(dfC)):
            ct = dfC.index[k]
            ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            if (ctR[0] < startDate) or (ctR[-1] > endDate):
                continue
            tempQ = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfQ).interpolate(
                limit=nFill, limit_direction='both')
            tempF = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfF).interpolate(
                limit=nFill, limit_direction='both')
            qLst.append(tempQ.values)
            fLst.append(tempF.values)
            cLst.append(dfC.iloc[k].values)
            cfLst.append(dfCF.iloc[k].values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    cf = np.stack(cfLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    infoDf = pd.DataFrame(infoLst)
    # add runoff as a second streamflow variable
    runoff = calRunoff(q[:, :, 0], infoDf)
    q = np.stack([q[:, :, 0], runoff], axis=-1).astype(np.float32)
    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g, cf=cf)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName, rho=rho, nFill=nFill, varG=varG,
                    varC=varC, varQ=['00060', 'runoff'],
                    varF=gridMET.varLst, siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
from hydroDL.master import basins
from hydroDL.data import usgs, gageII
from hydroDL.master import slurm
from hydroDL.post import axplot, figplot
# FIX: kPath is used below (kPath.dirData) but was never imported, which
# raises NameError at runtime.
from hydroDL import kPath
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import json
import sklearn.tree
import matplotlib.gridspec as gridspec
from sklearn import decomposition

# load gageII
dfGeo = gageII.readData()
dfGeo = gageII.updateCode(dfGeo)
dfGeo = gageII.removeField(dfGeo)
dirTree = r'C:\Users\geofk\work\waterQuality\C-Q\tree'

# count: total weekly observation counts per site and code
fileSiteNo = os.path.join(kPath.dirData, 'USGS', 'inventory',
                          'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
codeCount = sorted(usgs.codeLst)
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
countMatAll = np.load(os.path.join(dirInv, 'matCountWeekly.npy'))
countMat = np.ndarray([len(siteNoLstAll), len(codeCount)])
for ic, code in enumerate(codeCount):
    # sum over the week axis -> one total per site
    countMat[:, ic] = np.sum(countMatAll[:, :, ic], axis=1)
# select site
# Compare LSTM and WRTDS predictions against observations over the ind2
# window (presumably the test period — confirm where ind2 is defined) and
# collect per-site correlations.
dictLSTM = dictLSTMLst[0]
# corrMat[:, :, 0] = corr(LSTM, obs); [:, :, 1] = corr(WRTDS, obs);
# [:, :, 2] = corr(LSTM, WRTDS)
corrMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
for ic, code in enumerate(codeLst):
    for siteNo in dictSite[code]:
        indS = siteNoLst.index(siteNo)
        v0 = dictObs[siteNo][code].iloc[ind2].values
        v1 = dictLSTM[siteNo][code].iloc[ind2].values
        v2 = dictWRTDS[siteNo][code].iloc[ind2].values
        rmse1, corr1 = utils.stat.calErr(v1, v0)
        rmse2, corr2 = utils.stat.calErr(v2, v0)
        rmse3, corr3 = utils.stat.calErr(v1, v2)
        corrMat[indS, ic, 0] = corr1
        corrMat[indS, ic, 1] = corr2
        corrMat[indS, ic, 2] = corr3
dfG = gageII.readData(varLst=None, siteNoLst=siteNoLst)
varG = 'DDENS_2009'

# plot 121
importlib.reload(axplot)
# 20 codes laid out on a 5x4 grid of panels
codeLst2 = [
    '00095', '00400', '00405', '00600', '00605', '00618', '00660', '00665',
    '00681', '00915', '00925', '00930', '00935', '00940', '00945', '00950',
    '00955', '70303', '71846', '80154'
]
fig, axes = plt.subplots(5, 4)
for k, code in enumerate(codeLst2):
    j, i = utils.index2d(k, 5, 4)
    ax = axes[j, i]
    ic = codeLst.index(code)
    # x = corrMat[:, ic, 1]
from hydroDL.data import dbCsv from hydroDL.utils import gis, grid from hydroDL.data import usgs, gageII, gridMET, ntn, transform from hydroDL import kPath import time import csv import os import pandas as pd import numpy as np # load sites dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory') fileSiteNo = os.path.join(dirInv, 'siteNoLst-1979') siteNoLst = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist() varLst = ['ECO3_BAS_DOM', 'LAT_GAGE', 'LNG_GAGE', 'CLASS'] dfR = gageII.readData(varLst=varLst, siteNoLst=siteNoLst) dfR = gageII.updateCode(dfR) fileT = os.path.join(gageII.dirTab, 'lookupEco.csv') tabT = pd.read_csv(fileT).set_index('Eco3code') mat = np.full([len(siteNoLst), 3], np.nan) for code in range(1, 85): siteNoTemp = dfR[dfR['ECO3_BAS_DOM'] == code].index ind = [siteNoLst.index(siteNo) for siteNo in siteNoTemp] eco3 = tabT.loc[code]['Eco3'] EcoB1, EcoB2, EcoB3 = eco3.split('.') mat[ind, 0] = EcoB1 mat[ind, 1] = EcoB2 mat[ind, 2] = EcoB3 dfEcoB = pd.DataFrame(index=siteNoLst, columns=['EcoB1', 'EcoB2', 'EcoB3'],
def testModelSeq(outName, siteNoLst, wqData=None, ep=None, returnOut=False,
                 retest=False, sd=np.datetime64('1979-01-01'),
                 ed=np.datetime64('2019-12-31')):
    """Run the sequence test for all sites and cache one CSV per site.

    For every site without a cached file under seq-<sd>-<ed>-ep<ep> (or all
    sites when retest=True) the trained model is run over the full [sd, ed]
    window; predictions are de-normalized and written to
    <outDir>/seq-.../<siteNo>.  For SigmaLoss models a parallel
    <siteNo>_sigma file holds the predicted standard deviations.

    Returns a dict of per-site DataFrames when returnOut is True (with
    '<siteNo>_sigma' entries for SigmaLoss models); otherwise None.
    """
    # run sequence test for all sites, default to be from first date to last
    if type(siteNoLst) is not list:
        siteNoLst = [siteNoLst]
    master = loadMaster(outName)
    # SigmaLoss models emit interleaved [mean, log-variance] output channels
    if master['crit'] == 'SigmaLoss':
        doSigma = True
    else:
        doSigma = False
    if ep is None:
        ep = master['nEpoch']
    outDir = nameFolder(outName)
    sdS = pd.to_datetime(sd).strftime('%Y%m%d')
    edS = pd.to_datetime(ed).strftime('%Y%m%d')
    saveDir = os.path.join(outDir, 'seq-{}-{}-ep{}'.format(sdS, edS, ep))
    if not os.path.exists(saveDir):
        os.mkdir(saveDir)
    siteSaveLst = os.listdir(saveDir)
    if retest is True:
        sitePredLst = siteNoLst
    else:
        # only predict sites that do not have a cached file yet
        sitePredLst = [
            siteNo for siteNo in siteNoLst if siteNo not in siteSaveLst
        ]
    if len(sitePredLst) != 0:
        if wqData is None:
            wqData = waterQuality.DataModelWQ(master['dataName'])
        (varX, varXC, varY, varYC) = (master['varX'], master['varXC'],
                                      master['varY'], master['varYC'])
        (statX, statXC, statY, statYC) = loadStat(outName)
        model = loadModel(outName, ep=ep)
        tabG = gageII.readData(varLst=varXC, siteNoLst=siteNoLst)
        tabG = gageII.updateCode(tabG)
        for siteNo in sitePredLst:
            if 'DRAIN_SQKM' in varXC:
                area = tabG[tabG.index == siteNo]['DRAIN_SQKM'].values[0]
            else:
                area = None
            # test model
            print('testing {} from {} to {}'.format(siteNo, sdS, edS))
            freq = wqData.freq
            dfX = waterQuality.readSiteTS(siteNo, varX, freq=freq, area=area,
                                          sd=sd, ed=ed)
            xA = np.expand_dims(dfX.values, axis=1)
            # FIX: np.float was removed in NumPy 1.24; the builtin float is
            # the documented replacement (same float64 dtype).
            xcA = np.expand_dims(tabG.loc[siteNo].values.astype(float),
                                 axis=0)
            mtdX = waterQuality.extractVarMtd(varX)
            x = transform.transInAll(xA, mtdX, statLst=statX)
            mtdXC = waterQuality.extractVarMtd(varXC)
            xc = transform.transInAll(xcA, mtdXC, statLst=statXC)
            [x, xc] = trainTS.dealNaN([x, xc], master['optNaN'][:2])
            yOut = trainTS.testModel(model, x, xc)
            # transfer out
            nt = len(dfX)
            ny = len(varY) if varY is not None else 0
            nyc = len(varYC) if varYC is not None else 0
            if doSigma:
                # even channels = mean, odd channels = log-variance;
                # sigma = sqrt(exp(logvar))
                yP = np.full([nt, ny + nyc], np.nan)
                sP = np.full([nt, ny + nyc], np.nan)
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny * 2:2], statY,
                                             varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny * 2::2], statYC,
                                             varYC)
                sP[:, :ny] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, 1:ny * 2:2])), statY, varY)
                sP[:, ny:] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, ny * 2 + 1::2])), statYC,
                    varYC)
            else:
                yP = np.full([nt, ny + nyc], np.nan)
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny], statY, varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny:], statYC, varYC)
            # save output
            t = dfX.index.values.astype('datetime64[D]')
            colY = [] if varY is None else varY
            colYC = [] if varYC is None else varYC
            dfOut = pd.DataFrame(data=yP, columns=[colY + colYC], index=t)
            dfOut.index.name = 'date'
            dfOut = dfOut.reset_index()
            dfOut.to_csv(os.path.join(saveDir, siteNo), index=False)
            if doSigma:
                dfOutS = pd.DataFrame(data=sP, columns=[colY + colYC],
                                      index=t)
                dfOutS.index.name = 'date'
                # FIX: this previously read `dfOut.reset_index()`, so the
                # sigma file silently duplicated the mean predictions
                # instead of saving sP.
                dfOutS = dfOutS.reset_index()
                dfOutS.to_csv(os.path.join(saveDir, siteNo + '_sigma'),
                              index=False)
    # load all csv
    if returnOut:
        dictOut = dict()
        for siteNo in siteNoLst:
            # print('loading {} from {} to {}'.format(siteNo, sdS, edS))
            dfOut = pd.read_csv(os.path.join(saveDir, siteNo))
            dictOut[siteNo] = dfOut
            if doSigma:
                dfOut = pd.read_csv(os.path.join(saveDir,
                                                 siteNo + '_sigma'))
                dictOut[siteNo + '_sigma'] = dfOut
        return dictOut
from hydroDL import kPath
from hydroDL.app import waterQuality
from hydroDL.data import gageII
import pandas as pd
import numpy as np
import os
import time

# Build the reference-basin data model ('basinRef') and save odd/even-year
# subsets for temporal cross-validation.

# all gages
fileSiteNo = os.path.join(kPath.dirData, 'USGS', 'inventory',
                          'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
tabSel = gageII.readData(
    varLst=['CLASS'], siteNoLst=siteNoLstAll)
tabSel = gageII.updateCode(tabSel)
# CLASS == 1 — presumably the gageII "reference" class; verify coding
siteNoLst = tabSel[tabSel['CLASS'] == 1].index.tolist()
# wqData = waterQuality.DataModelWQ.new('basinRef', siteNoLst)
wqData = waterQuality.DataModelWQ('basinRef')

# indYr1 = waterQuality.indYr(wqData.info, yrLst=[1979, 2000])[0]
# wqData.saveSubset('Y8090', indYr1)
# indYr2 = waterQuality.indYr(wqData.info, yrLst=[2000, 2020])[0]
# wqData.saveSubset('Y0010', indYr2)
indYrO, indYrE = waterQuality.indYrOddEven(wqData.info)
wqData.saveSubset('Yodd', indYrO)
wqData.saveSubset('Yeven', indYrE)
[dfC.set_index('date').dropna(how='all'), dfQ.set_index('date')], axis=1, join='inner') dictData[siteNo] = pdf print('\t {}/{} {:.2f}'.format(i, len(siteNoLst), time.time() - t0), end='\r') fileName = os.path.join(kPath.dirWQ, 'tempData', 'CQall') pickle.dump(dictData, open(fileName, 'wb')) else: dictData = pickle.load(open(fileName, 'rb')) print('read all C-Q data {:.2f}'.format(time.time() - t0)) # calculate slope pdfArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=siteNoLst) unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2 codeLst = waterQuality.codeLst # codeLst = ['00955', '00940', '00915'] nSite = len(siteNoLst) codeQ = '00060_00003' pMat = np.full([nSite, len(codeLst), 4], np.nan) nMat = np.full([nSite, len(codeLst)], np.nan) t0 = time.time() for i, codeC in enumerate(codeLst): for j, siteNo in enumerate(siteNoLst): pdf = dictData[siteNo][[codeC, codeQ]].dropna() if len(pdf.index) > 10: area = pdfArea.loc[siteNo].values[0] q = pdf[codeQ].values / area * unitConv
varPLst = ['ph', 'Conduc', 'Ca', 'Mg', 'K', 'Na', 'NH4', 'NO3', 'Cl', 'SO4'] dfP = pd.DataFrame(columns=varPLst) for k in range(len(tab)): t1 = pd.to_datetime(tab.iloc[k]['dateon']).date() t2 = pd.to_datetime(tab.iloc[k]['dateoff']).date() tt = pd.date_range(t1, t2)[:-1] data = np.tile(tab.iloc[k][varPLst].values, [len(tt), 1]) tabTemp = pd.DataFrame(index=tt, columns=varPLst, data=data) dfP = dfP.append(tabTemp) dfP.dropna(how='all') startDate = pd.datetime(1979, 1, 1) endDate = pd.datetime(2019, 12, 31) # gageII tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst) tabG = gageII.updateCode(tabG) # read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY] fLst = list() # forcing ts gLst = list() # geo-const qLst = list() # streamflow cLst = list() # water quality cfLst = list() # water quality flags infoLst = list() t0 = time.time() for i, siteNo in enumerate(siteNoLst): t1 = time.time() dfC, dfCF = usgs.readSample(siteNo, codeLst=varC, startDate=startDate,
from hydroDL import kPath from hydroDL.app import waterQuality from hydroDL.master import basins from hydroDL.data import usgs, gageII, gridMET, ntn import numpy as np import pandas as pd import json import os regionLst = ['ECO2_BAS_DOM', 'NUTR_BAS_DOM', 'HLR_BAS_DOM_100M', 'PNV_BAS_DOM'] dfG = gageII.readData(varLst=regionLst+['LAT_GAGE', 'LNG_GAGE', 'CLASS']) # deal with PNV fileT = os.path.join(gageII.dirTab, 'lookupPNV.csv') tabT = pd.read_csv(fileT).set_index('PNV_CODE') for code in range(1, 63): siteNoTemp = dfG[dfG['PNV_BAS_DOM'] == code].index dfG.at[siteNoTemp, 'PNV_BAS_DOM2'] = tabT.loc[code]['PNV_CLASS_CODE'] dictName = { 'PNV': 'PNV_BAS_DOM2', 'NUTR': 'NUTR_BAS_DOM', 'HLR': 'HLR_BAS_DOM_100M', 'ECO': 'ECO2_BAS_DOM'} dictRegion = { 'PNV': [2, 3, 4, 5, 9, 11], 'NUTR': [2, 3, 4, 5, 6, 7, 8, 9, 11, 14], 'HLR': [3, 6, 7, 8, 9, 11, 12, 13, 16, 17, 18, 20], 'ECO': [5.3, 6.2, 8.1, 8.2, 8.3, 8.4, 9.2, 9.3, 9.4, 10.1, 11.1]
# Set up and train an AgeLSTM on a single site: streamflow as the sequence
# target (varY), water-quality codes as constant targets (varYC).
nh = 256  # LSTM hidden size
batchSize = [365, 50]  # presumably [window length, samples per batch] — confirm
# if not waterQuality.exist(siteNo):
#     wqData = waterQuality.DataModelWQ.new(siteNo, [siteNo])
wqData = waterQuality.DataModelWQ(siteNo, rmFlag=False)
varX = wqData.varF
varXC = wqData.varG
varY = [wqData.varQ[0]]
varYC = codeLst
varTup = (varX, varXC, varY, varYC)
dataTup, statTup = wqData.transIn(varTup=varTup)
# NaN handling flags per tensor (x, xc, y, yc) — see trainTS.dealNaN
dataTup = trainTS.dealNaN(dataTup, [1, 1, 0, 0])
sizeLst = trainTS.getSize(dataTup)
[nx, nxc, ny, nyc, nt, ns] = sizeLst
tabG = gageII.readData(varLst=varXC, siteNoLst=[siteNo])
tabG = gageII.updateCode(tabG)
dfX = waterQuality.readSiteX(siteNo, varX, nFill=5)
dfY = waterQuality.readSiteY(siteNo, varY)
dfYC = waterQuality.readSiteY(siteNo, varYC)
importlib.reload(rnn)
# constant attributes are concatenated onto the input, hence nx + nxc
model = rnn.AgeLSTM(nx=nx + nxc, ny=ny, nyc=nyc, nh=nh)
optim = torch.optim.Adadelta(model.parameters())
lossFun = crit.RmseMix()
if torch.cuda.is_available():
    lossFun = lossFun.cuda()
    model = model.cuda()
# train
model.train()
# load WRTDS results: baseline weekly run vs the flow-removed (rmq) run, and
# relate model skill to gageII basin attributes.
dirRoot1 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly')
dirRoot2 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly_rmq')
code = '00955'
dfRes1 = pd.read_csv(os.path.join(dirRoot1, 'result', code),
                     dtype={'siteNo': str}).set_index('siteNo')
dfRes2 = pd.read_csv(os.path.join(dirRoot2, 'result', code),
                     dtype={'siteNo': str}).set_index('siteNo')
# dfRes1[dfRes1 == -9999] = np.nan
dfGeo = gageII.readData(siteNoLst=dfRes1.index.tolist())
dfGeo = gageII.updateCode(dfGeo)

# select sites with more than nS samples
nS = 200
dfR1 = dfRes1[dfRes1['count'] > nS]
siteNoLst = dfR1.index.tolist()
dfR2 = dfRes2.loc[siteNoLst]
dfG = dfGeo.loc[siteNoLst]
varGLst = dfG.columns.tolist()
dfRsq = pd.DataFrame(index=varGLst, columns=['Rsq1', 'Rsq2'])
for varG in varGLst:
    x = dfG[varG].values
    y1 = dfR1['corr'].values
    # FIX: y2 previously read dfR1 again, so Rsq2 always duplicated Rsq1
    # and the rmq results in dfR2 were never used.
    y2 = dfR2['corr'].values
v3 = dictObs[siteNo][code].iloc[indT2].values vv1, vv2, vv3 = utils.rmNan([v1, v2, v3], returnInd=False) rmse1, corr1 = utils.stat.calErr(vv1, vv2) rmse2, corr2 = utils.stat.calErr(vv1, vv3) rmse3, corr3 = utils.stat.calErr(vv2, vv3) corrMat[indS, ic, 0] = corr1 corrMat[indS, ic, 1] = corr2 corrMat[indS, ic, 2] = corr3 rmseMat[indS, ic, 0] = rmse1 rmseMat[indS, ic, 1] = rmse2 rmseMat[indS, ic, 2] = rmse3 # load basin attributes regionLst = ['ECO2_BAS_DOM', 'NUTR_BAS_DOM', 'HLR_BAS_DOM_100M', 'PNV_BAS_DOM'] dfG = gageII.readData(siteNoLst=siteNoLst) fileT = os.path.join(gageII.dirTab, 'lookupPNV.csv') tabT = pd.read_csv(fileT).set_index('PNV_CODE') for code in range(1, 63): siteNoTemp = dfG[dfG['PNV_BAS_DOM'] == code].index dfG.at[siteNoTemp, 'PNV_BAS_DOM2'] = tabT.loc[code]['PNV_CLASS_CODE'] dfG = gageII.updateCode(dfG) # calculate LombScargle pMat = np.full([len(siteNoLst), len(codeLst)], np.nan) for ic, code in enumerate(codeLst): for siteNo in dictSite[code]: indS = siteNoLst.index(siteNo) df = dictObs[siteNo] t = np.arange(len(df))*7 y = df[code]
from hydroDL.data import gageII, usgs, gridMET
from hydroDL import kPath, utils
import os
import pandas as pd
import numpy as np
# NOTE(review): duplicate import — kPath is already imported above
from hydroDL import kPath

# Load the raw sample (C), sample-flag (CF) and streamflow (Q) tables for
# every inventory site into in-memory dicts keyed by site number.

fileSiteNo = os.path.join(kPath.dirData, 'USGS', 'inventory',
                          'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()

# all gages (re-reads the same inventory list as above)
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
fileSiteNo = os.path.join(dirInv, 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
codeLst = sorted(usgs.newC)
dfCrd = gageII.readData(varLst=['LAT_GAGE', 'LNG_GAGE', 'CLASS'],
                        siteNoLst=siteNoLstAll)
dfCrd = gageII.updateCode(dfCrd)
sd = np.datetime64('1979-01-01')

# load all data
dictC = dict()
dictCF = dict()
for k, siteNo in enumerate(siteNoLstAll):
    print(k, siteNo)
    dfC, dfCF = usgs.readSample(siteNo, codeLst=codeLst, startDate=sd,
                                flag=2)
    dictC[siteNo] = dfC
    dictCF[siteNo] = dfCF
dictQ = dict()
for k, siteNo in enumerate(siteNoLstAll):
    print(k, siteNo)
    dfQ = usgs.readStreamflow(siteNo, startDate=sd)
# Load LSTM and WRTDS sequence predictions plus observations, compute error
# matrices, and split the record at 2010 into two index windows.
codeLst = sorted(usgs.newC)
ep = 500
reTest = False
dataName = 'rbWN5'
siteNoLst = dictSite['comb']
nSite = len(siteNoLst)

# load all sequence
# NOTE(review): gated off with `if False:`; flip to True to (re)load —
# the nesting below is reconstructed and should be confirmed
if False:
    importlib.reload(wq.wqLoad)
    outNameLSTM = '{}-{}-{}-{}'.format('rbWN5', 'comb', 'QTFP_C', 'comb-B10')
    dictLSTM, dictWRTDS, dictObs = wq.loadModel(
        siteNoLst, outNameLSTM, codeLst)
    corrMat, rmseMat = wq.dictErr(dictLSTM, dictWRTDS, dictObs, codeLst)

# load basin attributes
dfG = gageII.readData(siteNoLst=siteNoLst)
dfG = gageII.updateRegion(dfG)
dfG = gageII.updateCode(dfG)
# ind1: 1980-2010 window; ind2: 2010 onward
t = dictObs[siteNoLst[0]].index.values
tt = np.datetime64('2010-01-01')
t0 = np.datetime64('1980-01-01')
ind1 = np.where((t < tt) & (t >= t0))[0]
ind2 = np.where(t >= tt)[0]

# caluculate interval
if False:
    intMatC = np.full([len(siteNoLst), len(codeLst), 4], np.nan)
    for k, siteNo in enumerate(siteNoLst):
        dfC = dictObs[siteNo]
        print('\t {}/{}'.format(k, len(siteNoLst)), end='\r')
# Fit the kateModel C-Q relation at sites where all three major-ion codes
# have dfAll values > 200 (presumably sample counts — confirm dfAll source),
# and record the correlation between modeled and observed concentration.
codeLst = ['00915', '00945', '00955']
tempLst = list()
for code in codeLst:
    temp = dfAll[dfAll[code] > 200].index.tolist()
    tempLst.append(temp)
# intersect the per-code site lists
siteNoLst = tempLst[0]
for k in range(1, len(tempLst)):
    siteNoLst = list(set(siteNoLst).intersection(tempLst[k]))
# NOTE(review): pd.datetime was removed in pandas >= 2.0; pd.Timestamp is
# the modern equivalent.
startDate = pd.datetime(1979, 1, 1)
endDate = pd.datetime(2019, 12, 31)
nc = len(codeLst)
ns = len(siteNoLst)

# cal dw
rMat = np.ndarray([ns, nc])
pdfArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=siteNoLst)
# flow-per-area conversion (0.3048^3: ft^3 -> m^3; seconds/year; km^2
# scaling) — presumably cfs/km^2 -> mm/yr; confirm
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2
for k, siteNo in enumerate(siteNoLst):
    for i, code in enumerate(codeLst):
        area = pdfArea.loc[siteNo]['DRAIN_SQKM']
        dfC = usgs.readSample(siteNo, codeLst=codeLst, startDate=startDate)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        df = dfC.join(dfQ)
        t = df.index.values
        q = df['00060_00003'].values / area * unitConv
        c = df[code].values
        (q, c), ind = utils.rmNan([q, c])
        # log-spaced flow grid spanning the positive observed range
        x = 10**np.linspace(np.log10(np.min(q[q > 0])),
                            np.log10(np.max(q[~np.isnan(q)])), 20)
        ceq, dw, y = wqRela.kateModel(q, c, q)
        corr = np.corrcoef(c, y)[0, 1]
# pick pickMat = (count >= 400) len(np.where(pickMat)[0]) indS = np.where(pickMat)[0] dictSite = dict() siteNoSel = [siteNoLst[ind] for ind in indS] siteNoSel = [ '01184000', '01434025', '01435000', '01466500', '04063700', '06313500', '06317000', '06324500', '09163500', '09352900', '11264500', '401733105392404' ] indS = [siteNoLst.index(siteNo) for siteNo in siteNoSel] dictSite['k12'] = siteNoSel dfCrd = gageII.readData(siteNoLst=siteNoSel, varLst=['DRAIN_SQKM', 'LNG_GAGE', 'LAT_GAGE']) lat = dfCrd['LAT_GAGE'].values lon = dfCrd['LNG_GAGE'].values area = dfCrd['DRAIN_SQKM'].values nc = len(codeSel) def funcM(): figM, axM = plt.subplots(2, 1, figsize=(6, 4)) axplot.mapPoint(axM[0], lat, lon, area, s=16, cb=True) axplot.mapPoint(axM[1], lat, lon, count[indS], s=16, cb=True) figP, axP = plt.subplots(nc, 1, figsize=(12, 8)) return figM, axM, figP, axP, lon, lat def funcP(iP, axP):
from hydroDL.data import gageII
import numpy as np
import pandas as pd
import os

# Build EcoTab.csv: for each level-3 ecoregion code (1-84), record its
# level-2 code, level-3 code and level-3 name from the lookup tables.
varLst = ['ECO2_BAS_DOM', 'ECO3_BAS_DOM']
dfR = gageII.readData(varLst=varLst)
dfR = gageII.updateCode(dfR)
fileEco3 = r'C:\Users\geofk\work\map\ecoRegion\tabEco3.csv'
tabEco3 = pd.read_csv(fileEco3)
fileLookup = os.path.join(gageII.dirTab, 'conterm_x_ecoregion3_names.csv')
tabLookup = pd.read_csv(fileLookup)
len(np.sort(dfR['ECO3_BAS_DOM'].unique()))
codeLst = list(range(1, 85))
dfT = pd.DataFrame(index=codeLst, columns=['Eco2', 'Eco3', 'Eco3_Name'])
for code in codeLst:
    eco2 = dfR[dfR['ECO3_BAS_DOM'] == code]['ECO2_BAS_DOM'].unique()
    eco3Name = tabLookup[tabLookup['ECO3_CODE'] == code]['ECO3_NAME'].values
    # FIX: eco3 was only assigned inside the branch below, so when
    # len(eco3Name) != 1 the final `if len(eco3) == 1` read a stale value
    # from the previous iteration (or raised NameError on the first pass),
    # writing the wrong Eco3 for that code.  Reset it every iteration.
    eco3 = np.array([])
    if len(eco3Name) == 1:
        eco3 = tabEco3[tabEco3['NA_L3NAME'] == eco3Name[0]][
            'NA_L3CODE'].values
        dfT.at[code, 'Eco3_Name'] = eco3Name[0]
    if len(eco2) == 1:
        dfT.at[code, 'Eco2'] = eco2[0]
    if len(eco3) == 1:
        dfT.at[code, 'Eco3'] = eco3[0]
fileT = os.path.join(gageII.dirTab, 'EcoTab.csv')
dfT.to_csv(fileT)
fig.show() if 'plotTsMap' in doLst: # plot map iCLst = [0, 11] tempLst = [npfLst[0]['matRmse2'][:, iC] for iC in iCLst] temp = np.sum(tempLst, axis=0) indG = np.where(~np.isnan(temp))[0].tolist() npf = npfLst[0] dataLst = [npf['matRmse2'][indG, iC] for iC in iCLst] dataNLst = [npf['matN2'][indG, iC] for iC in iCLst] mapTitleLst = ['RMSE of ' + codePdf['shortName'][varC[iC]] for iC in iCLst] siteNoLstTemp = [siteNoLst[i] for i in indG] dfCrd = gageII.readData( varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLstTemp) lat = dfCrd['LAT_GAGE'].values lon = dfCrd['LNG_GAGE'].values nTs = len(iCLst) nMap = len(dataLst) gsR = nTs figsize = [12, 8] # setup axes fig = plt.figure(figsize=figsize) gs = gridspec.GridSpec(gsR + nTs, nMap) gs.update(wspace=0.025, hspace=0.5) axTsLst = list() for k in range(nTs): axTs = fig.add_subplot(gs[k + gsR, :]) axTsLst.append(axTs) for k in range(nMap):
# ts map of single dataset, label and code freq = 'W' dirRoot1 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly') dirRoot2 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly_rmq') code = '00955' dfRes1 = pd.read_csv(os.path.join(dirRoot1, 'result', code), dtype={ 'siteNo': str }).set_index('siteNo') dfRes2 = pd.read_csv(os.path.join(dirRoot2, 'result', code), dtype={ 'siteNo': str }).set_index('siteNo') dfGeo = gageII.readData(siteNoLst=dfRes1.index.tolist()) dfGeo = gageII.updateCode(dfGeo) # select number of sites countS = np.sort(dfRes1['count'].values)[::-1] fig, ax = plt.subplots(1, 1) ax.plot(np.arange(len(countS)), countS, '-*') fig.show() # plot map nS = 200 dfR1 = dfRes1[dfRes1['count'] > nS] siteNoLst = dfR1.index.tolist() dfR2 = dfRes2.loc[siteNoLst] dfG = dfGeo.loc[siteNoLst]