def wrapData(caseName, siteNoLst, nFill=5, freq='D',
             sdStr='1979-01-01', edStr='2019-12-31'):
    varF = gridMET.varLst
    varQ = usgs.varQ
    varG = gageII.lstWaterQuality
    varC = usgs.newC
    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    tR = pd.date_range(np.datetime64(sdStr), np.datetime64(edStr))
    fLst, qLst, gLst, cLst = [list() for x in range(4)]
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        varLst = varQ + varF + varC
        df = readSiteTS(siteNo, varLst=varLst, freq=freq)
        # streamflow
        tempQ = pd.DataFrame({'date': tR}).set_index('date').join(df[varQ])
        qLst.append(tempQ.values)
        # forcings
        tempF = pd.DataFrame({'date': tR}).set_index('date').join(df[varF])
        tempF = tempF.interpolate(
            limit=nFill, limit_direction='both', limit_area='inside')
        fLst.append(tempF.values)
        # water quality
        tempC = pd.DataFrame({'date': tR}).set_index('date').join(df[varC])
        cLst.append(tempC.values)
        # geog
        gLst.append(tabG.loc[siteNo].values)
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    # save
    saveFolder = caseFolder(caseName)
    if not os.path.exists(saveFolder):
        os.mkdir(saveFolder)
    np.savez_compressed(os.path.join(saveFolder, 'data'), c=c, q=q, f=f, g=g)
    dictData = dict(name=caseName, varG=varG, varQ=varQ, varF=varF, varC=varC,
                    sd=sdStr, ed=edStr, freq=freq, siteNoLst=siteNoLst)
    with open(os.path.join(saveFolder, 'info') + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
def wrapData(caseName, siteNoLst, nFill=5, freq='D',
             sdStr='1979-01-01', edStr='2019-12-31',
             varF=gridMET.varLst + ntn.varLst + GLASS.varLst,
             varQ=usgs.varQ, varG=gageII.varLst, varC=usgs.newC):
    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    tR = pd.date_range(np.datetime64(sdStr), np.datetime64(edStr))
    fLst, qLst, gLst, cLst = [list() for x in range(4)]
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        varLst = varQ + varF + varC
        df = readSiteTS(siteNo, varLst=varLst, freq=freq)
        # streamflow
        tempQ = pd.DataFrame({'date': tR}).set_index('date').join(df[varQ])
        qLst.append(tempQ.values)
        # forcings
        tempF = pd.DataFrame({'date': tR}).set_index('date').join(df[varF])
        tempF = tempF.interpolate(
            limit=nFill, limit_direction='both', limit_area='inside')
        fLst.append(tempF.values)
        # water quality
        tempC = pd.DataFrame({'date': tR}).set_index('date').join(df[varC])
        cLst.append(tempC.values)
        # geog
        gLst.append(tabG.loc[siteNo].values)
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    # save
    saveDataFrame(caseName, c=c, q=q, f=f, g=g, varC=varC, varQ=varQ,
                  varF=varF, varG=varG, sdStr=sdStr, edStr=edStr,
                  freq=freq, siteNoLst=siteNoLst)
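# Usage sketch for wrapData (hypothetical case name and USGS site numbers;
# assumes the local gageII / gridMET / USGS data folders that hydroDL expects):
# siteNoDemo = ['01013500', '01030500']
# wrapData('demoCase', siteNoDemo, nFill=5, freq='W')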
# WRTDS corr
dirWrtds = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS-D', 'All')
fileC = os.path.join(dirWrtds, 'corr')
dfCorr = pd.read_csv(fileC, dtype={'siteNo': str}).set_index('siteNo')
code = '00915'
codeName = usgs.codePdf.loc[code]['shortName']
# load WRTDS par
fileP = os.path.join(dirWrtds, 'params', code)
dfPar = pd.read_csv(fileP, dtype={'siteNo': str}).set_index('siteNo')
# select site by count
n = 40 * 2
dfParSel = dfPar[dfPar['count'] > n]
siteNoLst = dfParSel.index.tolist()
dfCorrSel = dfCorr.loc[siteNoLst][code]
dfCrd = gageII.readData(varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
dfCrd = gageII.updateCode(dfCrd)
lat = dfCrd['LAT_GAGE'].values
lon = dfCrd['LNG_GAGE'].values
# plot map
parLst = ['pQ', 'pSinT', 'pCosT', 'pYr', 'b']
figM, axM = plt.subplots(3, 2, figsize=(12, 16))
axplot.mapPoint(axM[0, 0], lat, lon, dfCorrSel.values, s=16)
axM[0, 0].set_title('WRTDS corr {}'.format(codeName))
for k, par in enumerate(parLst):
    iy, ix = utils.index2d(k + 1, 3, 2)
    axplot.mapPoint(axM[iy, ix], lat, lon, dfParSel[par].values, s=16)
    axM[iy, ix].set_title('WRTDS {} {}'.format(par, codeName))
figM.show()
def testModelSeq(outName, siteNoLst, wqData=None, ep=None,
                 returnOut=False, retest=False,
                 sd=np.datetime64('1979-01-01'),
                 ed=np.datetime64('2019-12-31')):
    # run sequence test for all sites, default from first date to last date
    if type(siteNoLst) is not list:
        siteNoLst = [siteNoLst]
    master = loadMaster(outName)
    if master['crit'] == 'SigmaLoss':
        doSigma = True
    else:
        doSigma = False
    if ep is None:
        ep = master['nEpoch']
    outDir = nameFolder(outName)
    sdS = pd.to_datetime(sd).strftime('%Y%m%d')
    edS = pd.to_datetime(ed).strftime('%Y%m%d')
    saveDir = os.path.join(outDir, 'seq-{}-{}-ep{}'.format(sdS, edS, ep))
    if not os.path.exists(saveDir):
        os.mkdir(saveDir)
    siteSaveLst = os.listdir(saveDir)
    if retest is True:
        sitePredLst = siteNoLst
    else:
        sitePredLst = [
            siteNo for siteNo in siteNoLst if siteNo not in siteSaveLst
        ]
    if len(sitePredLst) != 0:
        if wqData is None:
            wqData = waterQuality.DataModelWQ(master['dataName'])
        (varX, varXC, varY, varYC) = (master['varX'], master['varXC'],
                                      master['varY'], master['varYC'])
        (statX, statXC, statY, statYC) = loadStat(outName)
        model = loadModel(outName, ep=ep)
        tabG = gageII.readData(varLst=varXC, siteNoLst=siteNoLst)
        tabG = gageII.updateCode(tabG)
        for siteNo in sitePredLst:
            if 'DRAIN_SQKM' in varXC:
                area = tabG[tabG.index == siteNo]['DRAIN_SQKM'].values[0]
            else:
                area = None
            # test model
            print('testing {} from {} to {}'.format(siteNo, sdS, edS))
            freq = wqData.freq
            dfX = waterQuality.readSiteTS(
                siteNo, varX, freq=freq, area=area, sd=sd, ed=ed)
            # dfX = waterQuality.readSiteX(
            #     siteNo, varX, sd=sd, ed=ed, area=area, nFill=5)
            xA = np.expand_dims(dfX.values, axis=1)
            xcA = np.expand_dims(
                tabG.loc[siteNo].values.astype(float), axis=0)
            mtdX = waterQuality.extractVarMtd(varX)
            x = transform.transInAll(xA, mtdX, statLst=statX)
            mtdXC = waterQuality.extractVarMtd(varXC)
            xc = transform.transInAll(xcA, mtdXC, statLst=statXC)
            [x, xc] = trainTS.dealNaN([x, xc], master['optNaN'][:2])
            yOut = trainTS.testModel(model, x, xc)
            # transfer out
            nt = len(dfX)
            ny = len(varY) if varY is not None else 0
            nyc = len(varYC) if varYC is not None else 0
            if doSigma:
                # SigmaLoss output interleaves mean and log-variance channels
                yP = np.full([nt, ny + nyc], np.nan)
                sP = np.full([nt, ny + nyc], np.nan)
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny * 2:2], statY, varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny * 2::2], statYC, varYC)
                sP[:, :ny] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, 1:ny * 2:2])), statY, varY)
                sP[:, ny:] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, ny * 2 + 1::2])), statYC, varYC)
            else:
                yP = np.full([nt, ny + nyc], np.nan)
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny], statY, varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny:], statYC, varYC)
            # save output
            t = dfX.index.values.astype('datetime64[D]')
            colY = [] if varY is None else varY
            colYC = [] if varYC is None else varYC
            dfOut = pd.DataFrame(data=yP, columns=colY + colYC, index=t)
            dfOut.index.name = 'date'
            dfOut = dfOut.reset_index()
            dfOut.to_csv(os.path.join(saveDir, siteNo), index=False)
            if doSigma:
                dfOutS = pd.DataFrame(data=sP, columns=colY + colYC, index=t)
                dfOutS.index.name = 'date'
                dfOutS = dfOutS.reset_index()
                dfOutS.to_csv(
                    os.path.join(saveDir, siteNo + '_sigma'), index=False)
    # load all csv
    if returnOut:
        dictOut = dict()
        for siteNo in siteNoLst:
            # print('loading {} from {} to {}'.format(siteNo, sdS, edS))
            dfOut = pd.read_csv(os.path.join(saveDir, siteNo))
            dictOut[siteNo] = dfOut
            if doSigma:
                dfOut = pd.read_csv(os.path.join(saveDir, siteNo + '_sigma'))
                dictOut[siteNo + '_sigma'] = dfOut
        return dictOut
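# Usage sketch for testModelSeq (hypothetical output name, following the
# naming pattern used elsewhere in this repo; assumes the model was trained
# beforehand so loadMaster/loadStat/loadModel can find its folder):
# dictOut = testModelSeq('rbWN5-comb-QTFP_C-comb-B10', siteNoLst,
#                        returnOut=True, retest=False)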
reTest = False
dataName = 'rbWN5'
siteNoLst = dictSite['comb']
nSite = len(siteNoLst)
# load all sequence
if False:
    importlib.reload(wq.wqLoad)
outNameLSTM = '{}-{}-{}-{}'.format('rbWN5', 'comb', 'QTFP_C', 'comb-B10')
dictLSTM, dictWRTDS, dictObs = wq.loadModel(
    siteNoLst, outNameLSTM, codeLst)
corrMat, rmseMat = wq.dictErr(dictLSTM, dictWRTDS, dictObs, codeLst)
# load basin attributes
dfG = gageII.readData(siteNoLst=siteNoLst)
dfG = gageII.updateRegion(dfG)
dfG = gageII.updateCode(dfG)
t = dictObs[siteNoLst[0]].index.values
tt = np.datetime64('2010-01-01')
t0 = np.datetime64('1980-01-01')
ind1 = np.where((t < tt) & (t >= t0))[0]
ind2 = np.where(t >= tt)[0]
# calculate interval
if False:
    intMatC = np.full([len(siteNoLst), len(codeLst), 4], np.nan)
    for k, siteNo in enumerate(siteNoLst):
        dfC = dictObs[siteNo]
        print('\t {}/{}'.format(k, len(siteNoLst)), end='\r')
        for j, code in enumerate(codeLst):
            tC = dfC.iloc[ind1][code].dropna().index.values
dfP = pd.DataFrame(columns=varPLst)
for k in range(len(tab)):
    t1 = pd.to_datetime(tab.iloc[k]['dateon']).date()
    t2 = pd.to_datetime(tab.iloc[k]['dateoff']).date()
    tt = pd.date_range(t1, t2)[:-1]
    # repeat the sampling-interval record over every day it covers
    data = np.tile(tab.iloc[k][varPLst].values, [len(tt), 1])
    tabTemp = pd.DataFrame(index=tt, columns=varPLst, data=data)
    dfP = pd.concat([dfP, tabTemp])
dfP = dfP.dropna(how='all')

startDate = pd.Timestamp(1979, 1, 1)
endDate = pd.Timestamp(2019, 12, 31)
# gageII
tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
tabG = gageII.updateCode(tabG)
# read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
fLst = list()  # forcing ts
gLst = list()  # geo-const
qLst = list()  # streamflow
cLst = list()  # water quality
cfLst = list()  # water quality flags
infoLst = list()
t0 = time.time()
for i, siteNo in enumerate(siteNoLst):
    t1 = time.time()
    dfC, dfCF = usgs.readSample(
        siteNo, codeLst=varC, startDate=startDate, flag=2)
def wrapData(caseName, siteNoLst, rho=365, nFill=5,
             varC=usgs.varC, varG=gageII.lstWaterQuality):
    """Wrap up input and target data for the model, as:
    x=[nT,nP,nX]
    y=[nP,nY]
    c=[nP,nC]
    where nP is the number of time series

    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS sites

    Keyword Arguments:
        rho {int} -- length of the input time window in days (default: {365})
        nFill {int} -- max number of continuous nan to interpolate in input
            data (default: {5})
        varC {list} -- list of water quality codes to learn
            (default: {usgs.varC})
        varG {list} -- list of constant variables in gageII
            (default: {gageII.lstWaterQuality})
    varQ and varF are fixed so far
    """
    # add a start/end date to improve efficiency
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)
    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    # read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
    fLst = list()  # forcing ts
    gLst = list()  # geo-const
    qLst = list()  # streamflow
    cLst = list()  # water quality
    cfLst = list()  # water quality flags
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        dfC, dfCF = usgs.readSample(
            siteNo, codeLst=varC, startDate=startDate, flag=2)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        dfF = gridMET.readBasin(siteNo)
        for k in range(len(dfC)):
            # cut a rho-day window ending at each water quality sample
            ct = dfC.index[k]
            ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            if (ctR[0] < startDate) or (ctR[-1] > endDate):
                continue
            tempQ = pd.DataFrame({'date': ctR}).set_index('date').join(
                dfQ).interpolate(limit=nFill, limit_direction='both')
            tempF = pd.DataFrame({'date': ctR}).set_index('date').join(
                dfF).interpolate(limit=nFill, limit_direction='both')
            qLst.append(tempQ.values)
            fLst.append(tempF.values)
            cLst.append(dfC.iloc[k].values)
            cfLst.append(dfCF.iloc[k].values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    cf = np.stack(cfLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    infoDf = pd.DataFrame(infoLst)
    # add runoff
    runoff = calRunoff(q[:, :, 0], infoDf)
    q = np.stack([q[:, :, 0], runoff], axis=-1).astype(np.float32)
    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g, cf=cf)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName, rho=rho, nFill=nFill, varG=varG,
                    varC=varC, varQ=['00060', 'runoff'],
                    varF=gridMET.varLst, siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
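# Usage sketch (hypothetical case name and sites; windows are cut around each
# water quality sample, so sites need samples between 1979 and 2019):
# wrapData('demoCaseRho', ['01013500', '01030500'], rho=365, nFill=5)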
# ts map of single dataset, label and code
freq = 'W'
dirRoot1 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly')
dirRoot2 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly_rmq')
code = '00955'
dfRes1 = pd.read_csv(
    os.path.join(dirRoot1, 'result', code),
    dtype={'siteNo': str}).set_index('siteNo')
dfRes2 = pd.read_csv(
    os.path.join(dirRoot2, 'result', code),
    dtype={'siteNo': str}).set_index('siteNo')
dfGeo = gageII.readData(siteNoLst=dfRes1.index.tolist())
dfGeo = gageII.updateCode(dfGeo)
# select number of sites
countS = np.sort(dfRes1['count'].values)[::-1]
fig, ax = plt.subplots(1, 1)
ax.plot(np.arange(len(countS)), countS, '-*')
fig.show()
# plot map
nS = 200
dfR1 = dfRes1[dfRes1['count'] > nS]
siteNoLst = dfR1.index.tolist()
dfR2 = dfRes2.loc[siteNoLst]
dfG = dfGeo.loc[siteNoLst]
# crd
from hydroDL.data import gageII
import numpy as np
import pandas as pd
import os

varLst = ['ECO2_BAS_DOM', 'ECO3_BAS_DOM']
dfR = gageII.readData(varLst=varLst)
dfR = gageII.updateCode(dfR)
fileEco3 = r'C:\Users\geofk\work\map\ecoRegion\tabEco3.csv'
tabEco3 = pd.read_csv(fileEco3)
fileLookup = os.path.join(gageII.dirTab, 'conterm_x_ecoregion3_names.csv')
tabLookup = pd.read_csv(fileLookup)
# number of distinct level-3 ecoregion codes in gageII
print(len(np.sort(dfR['ECO3_BAS_DOM'].unique())))

codeLst = list(range(1, 85))
dfT = pd.DataFrame(index=codeLst, columns=['Eco2', 'Eco3', 'Eco3_Name'])
for code in codeLst:
    eco2 = dfR[dfR['ECO3_BAS_DOM'] == code]['ECO2_BAS_DOM'].unique()
    eco3Name = tabLookup[tabLookup['ECO3_CODE'] == code]['ECO3_NAME'].values
    if len(eco2) == 1:
        dfT.at[code, 'Eco2'] = eco2[0]
    if len(eco3Name) == 1:
        dfT.at[code, 'Eco3_Name'] = eco3Name[0]
        eco3 = tabEco3[tabEco3['NA_L3NAME'] == eco3Name[0]]['NA_L3CODE'].values
        if len(eco3) == 1:
            dfT.at[code, 'Eco3'] = eco3[0]
fileT = os.path.join(gageII.dirTab, 'EcoTab.csv')
dfT.to_csv(fileT)
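# Usage sketch: map each basin's ECO3_BAS_DOM code to its level-3 ecoregion
# label via the EcoTab.csv written above:
# ecoTab = pd.read_csv(os.path.join(gageII.dirTab, 'EcoTab.csv'), index_col=0)
# eco3OfSite = dfR['ECO3_BAS_DOM'].map(ecoTab['Eco3'])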
fig.suptitle(title)
fig.show()
fig.savefig(os.path.join(figFolder, figName))

siteNoLst = wqData.info['siteNo'].unique().tolist()
dfHBN = pd.read_csv(
    os.path.join(kPath.dirData, 'USGS', 'inventory', 'HBN.csv'),
    dtype={'siteNo': str}).set_index('siteNo')
siteNoHBN = [siteNo for siteNo in dfHBN.index.tolist() if siteNo in siteNoLst]
dropColLst = [
    'STANAME', 'WR_REPORT_REMARKS', 'ADR_CITATION', 'SCREENING_COMMENTS'
]
dfX = gageII.readData(siteNoLst=siteNoLst).drop(columns=dropColLst)
dfX = gageII.updateCode(dfX)
# ft^3/s divided by basin area in km^2 -> runoff depth in m/yr
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2
# area vs error
indHBN = [siteNoLst.index(siteNo) for siteNo in siteNoHBN]
area = dfX['DRAIN_SQKM'].values
errMat = errMatLst2[0]
code = '00605'
# code = '00955'
err = errMat[:, wqData.varC.index(code), 1]
fig, ax = plt.subplots(1, 1)
ax.plot(area, err, 'b*')
ax.plot(area[indHBN], err[indHBN], 'r*')
# np.nanmedian(err)
# np.nanmedian(err[indHBN, :])
fig.show()
from hydroDL import kPath
from hydroDL.app import waterQuality
from hydroDL.data import gageII
import pandas as pd
import numpy as np
import os
import time

# all gages
fileSiteNo = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
tabSel = gageII.readData(varLst=['CLASS'], siteNoLst=siteNoLstAll)
tabSel = gageII.updateCode(tabSel)
siteNoLst = tabSel[tabSel['CLASS'] == 1].index.tolist()

# wqData = waterQuality.DataModelWQ.new('basinRef', siteNoLst)
wqData = waterQuality.DataModelWQ('basinRef')
# indYr1 = waterQuality.indYr(wqData.info, yrLst=[1979, 2000])[0]
# wqData.saveSubset('Y8090', indYr1)
# indYr2 = waterQuality.indYr(wqData.info, yrLst=[2000, 2020])[0]
# wqData.saveSubset('Y0010', indYr2)
indYrO, indYrE = waterQuality.indYrOddEven(wqData.info)
wqData.saveSubset('Yodd', indYrO)
wqData.saveSubset('Yeven', indYrE)
def wrapData(caseName, siteNoLst, rho=365, freq='D', optC='end'):
    """Wrap up input and target data for the model, as:
    x=[nT,nP,nX]
    y=[nP,nY]
    c=[nP,nC]
    where nP is the number of time series

    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS sites

    Keyword Arguments:
        rho {int} -- length of the input time window, in days for freq='D'
            and weeks for freq='W' (default: {365})
        freq {str} -- 'D' for daily or 'W' for weekly windows (default: {'D'})
        optC {str} -- 'end' keeps only the water quality sample at the window
            end; 'seq' keeps the full sequence (default: {'end'})
    varQ, varF, varC and varG are fixed so far
    """
    sd = np.datetime64('1979-01-01')
    ed = np.datetime64('2019-12-31')
    # ts data
    varF = gridMET.varLst + ntn.varLst
    varC = usgs.varC
    varQ = usgs.varQ
    varG = gageII.lstWaterQuality
    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    # read data and merge to: x=[nT,nP,nX], xc=[nP,nY]
    fLst, qLst, cLst, gLst = [list() for x in range(4)]
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        varLst = varQ + varC + varF
        df = readSiteTS(siteNo, varLst=varLst, freq=freq)
        dfC = df[varC].dropna(how='all')
        for k in range(len(dfC)):
            ct = dfC.index[k]
            if freq == 'D':
                ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            elif freq == 'W':
                ctR = pd.date_range(
                    ct - pd.Timedelta(days=rho * 7 - 1), ct, freq='W-TUE')
            if (ctR[0] < sd) or (ctR[-1] > ed):
                continue
            for lst, var in zip([fLst, qLst], [varF, varQ]):
                temp = pd.DataFrame(
                    {'date': ctR}).set_index('date').join(df[var])
                # temp = temp.interpolate(
                #     limit=nFill, limit_direction='both', limit_area='inside')
                # gave up interpolation after many thoughts
                lst.append(temp.values)
            if optC == 'end':
                cLst.append(dfC.iloc[k].values)
            elif optC == 'seq':
                tempC = pd.DataFrame(
                    {'date': ctR}).set_index('date').join(df[varC])
                cLst.append(tempC.values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    if optC == 'end':
        c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    elif optC == 'seq':
        c = np.stack(cLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    # save
    infoDf = pd.DataFrame(infoLst)
    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName, rho=rho, varG=varG, varC=varC,
                    varQ=varQ, varF=varF, siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
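# Usage sketch (hypothetical case names; optC='end' gives one sample per
# window, optC='seq' gives a full concentration sequence per window):
# wrapData('demoCaseD', siteNoLst, rho=365, freq='D', optC='end')
# wrapData('demoCaseW', siteNoLst, rho=52, freq='W', optC='seq')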
siteNoLst = wqData.info.iloc[wqData.subset[trainSet]].siteNo.unique()
dfCrd = gageII.readData(
    varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
lat = dfCrd['LAT_GAGE'].values
lon = dfCrd['LNG_GAGE'].values
shortName = usgs.codePdf.loc[code]['shortName']
axplot.mapPoint(axM[k], lat, lon, corrLst[k][:, 1], vRange=[0.5, 1], s=16)
axM[k].set_title('Testing correlation of {}'.format(shortName))
# axplot.mapPoint(axM[k], lat, lon, rmseLst[k][:, 1], s=16)
# axM[k].set_title('Testing RMSE of {}'.format(shortName))
plt.tight_layout()
figM.show()

# get rid of 00010 and 00095
siteLst = list()
for k, code in enumerate(codeLst):
    trainSet = '{}-Y1'.format(code)
    siteNoLst = wqData.info.iloc[wqData.subset[trainSet]].siteNo.unique()
    siteLst.append(siteNoLst)
siteNoAll = np.unique(np.concatenate(siteLst))

varG = ['GEOL_REEDBUSH_DOM', 'GEOL_HUNT_DOM_CODE']
dfGeog = gageII.readData(
    varLst=varG + ['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
dfGeog = gageII.updateCode(dfGeog)
lat = dfGeog['LAT_GAGE'].values
lon = dfGeog['LNG_GAGE'].values
figM, axM = plt.subplots(len(varG), 1, figsize=(6, 8))
for k, var in enumerate(varG):
    axplot.mapPoint(axM[k], lat, lon, dfGeog[var], s=16)
    axM[k].set_title(var)
figM.show()
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import scipy
from mpl_toolkits import basemap
from hydroDL import kPath
from hydroDL.data import gageII, usgs

dirSel = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteSel')
with open(os.path.join(dirSel, 'dictRB_Y30N5.json')) as f:
    dictSite = json.load(f)
siteNoLst = dictSite['comb']
dfCrd1 = gageII.readData(
    varLst=['LAT_GAGE', 'LNG_GAGE', 'CLASS'], siteNoLst=siteNoLst)
dfCrd1 = gageII.updateCode(dfCrd1)

dirSel = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteSel')
with open(os.path.join(dirSel, 'dictRB_Y30N5.json')) as f:
    dictSiteN5 = json.load(f)
with open(os.path.join(dirSel, 'dictRB_Y30N2.json')) as f:
    dictSiteN2 = json.load(f)
codeLst = sorted(usgs.newC)
# sites passing the N2 screen but not the N5 screen
dictSite = dict()
for code in usgs.newC + ['comb']:
    siteNoCode = list(set(dictSiteN2[code]) - set(dictSiteN5['comb']))
    dictSite[code] = siteNoCode
siteNoLst = dictSite['comb']
nSite = len(siteNoLst)
dfCrd2 = gageII.readData(
    varLst=['LAT_GAGE', 'LNG_GAGE', 'CLASS'], siteNoLst=siteNoLst)
corrMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
for ic, code in enumerate(codeLst):
    for siteNo in dictSite[code]:
        indS = siteNoLst.index(siteNo)
        v1 = dictLSTM[siteNo][code].iloc[ind2].values
        v2 = dictWRTDS[siteNo][code].iloc[ind2].values
        v3 = dictObs[siteNo][code].iloc[ind2].values
        rmse1, corr1 = utils.stat.calErr(v1, v2)
        rmse2, corr2 = utils.stat.calErr(v1, v3)
        rmse3, corr3 = utils.stat.calErr(v2, v3)
        corrMat[indS, ic, 0] = corr1  # LSTM vs WRTDS
        corrMat[indS, ic, 1] = corr2  # LSTM vs Obs
        corrMat[indS, ic, 2] = corr3  # WRTDS vs Obs
dfRef = gageII.readData(varLst=['CLASS'], siteNoLst=siteNoLst)
dfRef = gageII.updateCode(dfRef)
indRef = np.where(dfRef['CLASS'].values == 1)[0]
indNonRef = np.where(dfRef['CLASS'].values == 0)[0]
# plot box
labLst1 = [
    usgs.codePdf.loc[code]['shortName'] + '\n' + code for code in codeLst
]
labLst2 = ['LSTM vs WRTDS', 'LSTM vs Obs', 'WRTDS vs Obs']
dataBox = list()
for k in range(len(codeLst)):
    code = codeLst[k]
    temp = list()
    for i in [0, 1, 2]:
        temp.append(corrMat[indRef, k, i])
    dataBox.append(temp)
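# A minimal plain-matplotlib sketch of the box plot (the repo likely draws
# this through its own axplot helper; labLst1/labLst2/dataBox as built above):
# fig, axes = plt.subplots(1, len(dataBox), sharey=True, figsize=(12, 4))
# for k, ax in enumerate(axes):
#     data = [v[~np.isnan(v)] for v in dataBox[k]]  # boxplot chokes on NaN
#     ax.boxplot(data, labels=labLst2, showfliers=False)
#     ax.set_title(labLst1[k])
# fig.show()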
if len(wqData.c.shape) == 3:
    p = yP[-1, :, master['varY'].index(code)]
    o = wqData.c[-1, ind, ic]
elif len(wqData.c.shape) == 2:
    p = ycP[:, master['varYC'].index(code)]
    o = wqData.c[ind, ic]
for siteNo in dictSite[code]:
    iS = siteNoLst.index(siteNo)
    indS = info[info['siteNo'] == siteNo].index.values
    rmse, corr = utils.stat.calErr(p[indS], o[indS])
    corrMat[iS, iCode, iT] = corr
    rmseMat[iS, iCode, iT] = rmse

# reference basins
tabRef = gageII.readData(varLst=['CLASS'], siteNoLst=siteNoLst)
tabRef = gageII.updateCode(tabRef)
bRef = (tabRef['CLASS'] == 1).values
ind1 = np.where(bRef)[0]
ind2 = np.where(~bRef)[0]
# plot box
labLst1 = [
    usgs.codePdf.loc[code]['shortName'] + '\n' + code for code in codeLst
]
labLst2 = ['train-ref', 'train-nonref', 'test-ref', 'test-nonref']
dataBox = list()
for k in range(len(codeLst)):
    code = codeLst[k]
    temp = list()
    for i in range(corrMat.shape[2]):