def testWRTDS(dataName, trainSet, testSet, codeLst): DF = dbBasin.DataFrameBasin(dataName) # Calculate WRTDS from train and test set varX = ['00060'] varY = codeLst d1 = dbBasin.DataModelBasin(DF, subset=trainSet, varX=varX, varY=varY) d2 = dbBasin.DataModelBasin(DF, subset=testSet, varX=varX, varY=varY) tt1 = pd.to_datetime(d1.t) yr1 = tt1.year.values t1 = yr1 + tt1.dayofyear.values / 365 sinT1 = np.sin(2 * np.pi * t1) cosT1 = np.cos(2 * np.pi * t1) tt2 = pd.to_datetime(d2.t) yr2 = tt2.year.values t2 = yr2 + tt2.dayofyear.values / 365 sinT2 = np.sin(2 * np.pi * t2) cosT2 = np.cos(2 * np.pi * t2) ### yOut = np.full([len(d2.t), len(d2.siteNoLst), len(varY)], np.nan) t0 = time.time() for indS, siteNo in enumerate(d2.siteNoLst): for indC, code in enumerate(varY): print('{} {} {} {}'.format(indS, siteNo, code, time.time() - t0)) y1 = d1.Y[:, indS, indC].copy() q1 = d1.X[:, indS, 0].copy() q1[q1 < 0] = 0 logq1 = np.log(q1 + sn) x1 = np.stack([logq1, yr1, sinT1, cosT1]).T y2 = d2.Y[:, indS, indC].copy() q2 = d2.X[:, indS, 0].copy() q2[q2 < 0] = 0 logq2 = np.log(q2 + sn) x2 = np.stack([logq2, yr2, sinT2, cosT2]).T [xx1, yy1], ind1 = utils.rmNan([x1, y1]) if testSet == 'all': [xx2], ind2 = utils.rmNan([x2]) else: [xx2, yy2], ind2 = utils.rmNan([x2, y2]) if len(ind1) < 40: continue for k in ind2: dY = np.abs(t2[k] - t1[ind1]) dQ = np.abs(logq2[k] - logq1[ind1]) dS = np.min(np.stack( [abs(np.ceil(dY) - dY), abs(dY - np.floor(dY))]), axis=0) d = np.stack([dY, dQ, dS]) ww, ind = calWeight(d) model = sm.WLS(yy1[ind], xx1[ind], weights=ww).fit() yp = model.predict(x2[k, :])[0] yOut[k, indS, indC] = yp return yOut
def trainModel(outName):
    """Train a basin model described by the master dictionary of `outName`.

    Loads the data frame and training subset named in the master dict,
    normalizes it, builds loss/model/optimizer, then trains in chunks of
    `saveEpoch` epochs, checkpointing model state after each chunk and
    finally writing the per-epoch loss history to `<outFolder>/loss.csv`.

    Parameters
    ----------
    outName : experiment name; resolved to a folder via `nameFolder` and a
        config via `loadMaster` (both defined elsewhere in this module).
    """
    outFolder = nameFolder(outName)
    dictP = loadMaster(outName)
    # load data
    DF = dbBasin.DataFrameBasin(dictP['dataName'])
    dictVar = {k: dictP[k] for k in ('varX', 'varXC', 'varY', 'varYC')}
    DM = dbBasin.DataModelBasin(DF, subset=dictP['trainSet'], **dictVar)
    if dictP['borrowStat'] is not None:
        # reuse normalization statistics from another experiment
        DM.loadStat(dictP['borrowStat'])
    DM.trans(mtdX=dictP['mtdX'], mtdXC=dictP['mtdXC'],
             mtdY=dictP['mtdY'], mtdYC=dictP['mtdYC'])
    # persist the (possibly borrowed) stats so testModel can de-normalize
    DM.saveStat(outFolder)
    dataTup = DM.getData()
    dataTup = trainBasin.dealNaN(dataTup, dictP['optNaN'])
    # define loss -- class name comes from the master dict, looked up in crit
    lossFun = getattr(crit, dictP['crit'])()
    # define model
    model = defineModel(dataTup, dictP)
    if torch.cuda.is_available():
        lossFun = lossFun.cuda()
        model = model.cuda()
    if dictP['optim'] == 'AdaDelta':
        optim = torch.optim.Adadelta(model.parameters())
    else:
        # only AdaDelta is supported here
        raise RuntimeError('optimizor function not specified')
    lossLst = list()
    nEp = dictP['nEpoch']
    sEp = dictP['saveEpoch']
    logFile = os.path.join(outFolder, 'log')
    if os.path.exists(logFile):
        # start with a fresh log for this training run
        os.remove(logFile)
    # train sEp epochs at a time so state can be checkpointed periodically
    for k in range(0, nEp, sEp):
        model, optim, lossEp = trainBasin.trainModel(
            dataTup, model, lossFun, optim, batchSize=dictP['batchSize'],
            nEp=sEp, cEp=k, logFile=logFile,
            optBatch=dictP['optBatch'], nIterEp=dictP['nIterEp'])
        # save model
        saveModelState(outName, k+sEp, model, optim=optim)
        lossLst = lossLst+lossEp
    lossFile = os.path.join(outFolder, 'loss.csv')
    pd.DataFrame(lossLst).to_csv(lossFile, index=False, header=False)
def testModel(outName, DF=None, testSet='all', ep=None, reTest=False, batchSize=20):
    """Run a trained model over a test subset, caching predictions to disk.

    If a cached result file exists (and reTest is False) it is loaded and
    returned directly; otherwise the model checkpoint at epoch `ep` is
    evaluated and the de-normalized predictions are saved and returned.

    Parameters
    ----------
    outName : experiment name (resolves folder and master config).
    DF : optional pre-loaded DataFrameBasin; loaded from config when None.
    testSet : subset name to evaluate.
    ep : checkpoint epoch; defaults to the configured nEpoch.
    reTest : force re-evaluation even when a cached file exists.
    batchSize : evaluation batch size.

    Returns
    -------
    (yP, ycP) : de-normalized time-series and constant predictions.
    """
    # load master
    dictP = loadMaster(outName)
    if ep is None:
        ep = dictP['nEpoch']
    outFolder = nameFolder(outName)
    testFile = os.path.join(
        outFolder, 'testP-{}-Ep{}.npz'.format(testSet, ep))
    # fast path: reuse a previously saved prediction file
    if os.path.exists(testFile) and reTest is False:
        print('load saved test result')
        npz = np.load(testFile, allow_pickle=True)
        return npz['yP'], npz['ycP']
    # slow path: load data, restore the checkpoint, and evaluate
    if DF is None:
        DF = dbBasin.DataFrameBasin(dictP['dataName'])
    dictVar = {key: dictP[key] for key in ('varX', 'varXC', 'varY', 'varYC')}
    DM = dbBasin.DataModelBasin(DF, subset=testSet, **dictVar)
    DM.loadStat(outFolder)
    dataTup = trainBasin.dealNaN(DM.getData(), dictP['optNaN'])
    model = loadModelState(outName, ep, defineModel(dataTup, dictP))
    # test model - point by point
    x, xc = dataTup[0], dataTup[1]
    ny = np.shape(dataTup[2])[2]
    yOut, ycOut = trainBasin.testModel(model, x, xc, ny, batchSize=batchSize)
    # undo normalization before caching and returning
    yP = DM.transOutY(yOut)
    ycP = DM.transOutYC(ycOut)
    np.savez(testFile, yP=yP, ycP=ycP)
    return yP, ycP
from hydroDL.data import usgs, gageII, gridMET, ntn, GLASS
from hydroDL.master import slurm
from hydroDL.data import dbBasin
from hydroDL.master import basinFull

if __name__ == '__main__':
    # Re-run model evaluation for each normalized dataset; failures for one
    # dataset should not stop the others.
    dataNameLst = ['G200Norm', 'G400Norm']
    for dataName in dataNameLst:
        outName = dataName
        DF = dbBasin.DataFrameBasin(dataName)
        testSet = 'all'
        try:
            yP, ycP = basinFull.testModel(
                outName, DF=DF, testSet=testSet, ep=200, reTest=True)
            print('tested {}'.format(outName), flush=True)
        # FIX: was a bare `except:`, which also swallows KeyboardInterrupt
        # and SystemExit; catch Exception so the loop stays interruptible.
        except Exception:
            print('skiped {}'.format(outName), flush=True)
import numpy as np
import matplotlib.pyplot as plt
from hydroDL.post import axplot, figplot
from hydroDL import kPath, utils
import json
import os
import importlib
from hydroDL.master import basinFull
from hydroDL.app.waterQuality import WRTDS
# FIX: dbBasin and usgs are used below but were never imported in this script
from hydroDL.data import dbBasin, usgs

dataName = 'G400Norm'
outName = dataName
trainSet = 'rmRT20'
testSet = 'pkRT20'
DF = dbBasin.DataFrameBasin(outName)
yP, ycP = basinFull.testModel(outName, DF=DF, testSet=testSet, ep=500)

# deal with mean and std: predictions are per-site normalized; recover
# physical units via the per-site mean ('-M') and std ('-S') stored in DF.g.
codeLst = usgs.newC
# FIX: was `np.ndarray(yP.shape)` -- the raw ndarray constructor returns
# uninitialized memory (NumPy docs advise array/zeros/empty instead).
# NaN-fill guarantees any slice not written below is visibly missing.
yOut = np.full(yP.shape, np.nan)
for k, code in enumerate(codeLst):
    m = DF.g[:, DF.varG.index(code + '-M')]
    s = DF.g[:, DF.varG.index(code + '-S')]
    data = yP[:, :, k]
    yOut[:, :, k] = data * s + m

# WRTDS
# yW = WRTDS.testWRTDS(dataName, trainSet, testSet, codeLst)
dirRoot = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS-dbBasin')
fileName = '{}-{}-{}'.format(dataName, trainSet, 'all')
import matplotlib.pyplot as plt
from hydroDL.post import axplot, figplot
from hydroDL import kPath, utils
import json
import os
import importlib
from hydroDL.master import basinFull

importlib.reload(utils)

# Script: build a locally-normalized 'weathering' dataset and define the
# variable lists / normalization methods for a subsequent training run.
# NOTE(review): dbBasin and gageII are used below but not imported in this
# chunk -- presumably provided by surrounding context; verify.

# read the curated site list for the weathering study
dirSel = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteSel')
dictSiteName = 'dictWeathering.json'
with open(os.path.join(dirSel, dictSiteName)) as f:
    dictSite = json.load(f)
siteNoLst = dictSite['k12']

# normalize
DF = dbBasin.DataFrameBasin('weathering')
# major-ion USGS codes (Ca, Mg, Na, K, Cl, SO4, SiO2 -- presumably; confirm)
codeSel = ['00915', '00925', '00930', '00935', '00940', '00945', '00955']
# per-site normalization computed on the 'rmD5' subset, then saved as a
# new dataset so later runs can load it directly
DF = dbBasin.func.localNorm(DF, subset='rmD5')
DF.saveAs('weatheringNorm')
dataName = 'weatheringNorm'
label = 'test'
# inputs: all forcing + discharge variables
varX = DF.varF + DF.varQ
mtdX = dbBasin.io.extractVarMtd(varX)
# targets: locally-normalized ('-N') concentrations
varY = [c + '-N' for c in codeSel]
mtdY = dbBasin.io.extractVarMtd(varY)
# constant inputs: gageII attributes plus each code's local mean and std
varXC = gageII.varLst + [c + '-M' for c in codeSel] + [c + '-S' for c in codeSel]
mtdXC = dbBasin.io.extractVarMtd(varXC)
varYC = None
mtdYC = dbBasin.io.extractVarMtd(varYC)
import torch
from hydroDL.model import rnn, crit, trainBasin

# Script: convert a pickled whole-model checkpoint into a state-dict file,
# then rebuild the model from config and prepare data for evaluation.
# NOTE(review): this chunk is truncated -- it ends at a dangling `else:`;
# the remainder (presumably an error raise for unknown modelName) is not
# visible here.

outName = 'weathering-FPR2QC-t365-B10'
ep = 100
# save: load the full pickled model and re-save only its state_dict, which
# is portable across code changes
outFolder = basinFull.nameFolder(outName)
modelFile = os.path.join(outFolder, 'model_ep{}'.format(ep))
model = torch.load(modelFile)
modelStateFile = os.path.join(outFolder, 'modelState_ep{}'.format(ep))
torch.save(model.state_dict(), modelStateFile)
# load: rebuild model/data from the master config
dictP = basinFull.loadMaster(outName)
DF = dbBasin.DataFrameBasin(dictP['dataName'])
dictVar = {k: dictP[k] for k in ('varX', 'varXC', 'varY', 'varYC')}
DM = dbBasin.DataModelBasin(DF, subset='A10', **dictVar)
DM.loadStat(outFolder)
dataTup = DM.getData()
[nx, nxc, ny, nyc, nt, ns] = trainBasin.getSize(dataTup)
dataTup = trainBasin.dealNaN(dataTup, dictP['optNaN'])
# instantiate the architecture named in the config (truncated below)
if dictP['modelName'] == 'CudnnLSTM':
    model = rnn.CudnnLstmModel(
        nx=nx + nxc, ny=ny + nyc, hiddenSize=dictP['hiddenSize'])
elif dictP['modelName'] == 'LstmModel':
    model = rnn.LstmModel(
        nx=nx + nxc, ny=ny + nyc, hiddenSize=dictP['hiddenSize'])
else:
import matplotlib.pyplot as plt

# Script: for sites with >200 daily samples of code 00915, compare the
# log-discharge distribution on sampled days vs all days with a two-sample
# KS test, recording the statistic and sample count per site.
# NOTE(review): kPath, os, pd, np, usgs, scipy, dbBasin, gageII are used
# below but not imported in this chunk -- provided by surrounding context.

# load all site counts
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
fileSiteNo = os.path.join(dirInv, 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
codeLst = sorted(usgs.codeLst)
countD = np.load(os.path.join(dirInv, 'matCountDaily.npy'))
code = '00915'
indC = codeLst.index(code)
# total daily observation count per site for this code
count = np.sum(countD[:, :, indC], axis=1)
indSLst = np.where(count > 200)[0]
siteNoLst = [siteNoLstAll[ind] for ind in indSLst]
# DF = dbBasin.DataFrameBasin.new('00915G200', siteNoLst)
DF = dbBasin.DataFrameBasin('00915G200')
q = DF.q[:, :, 1]
c = DF.c[:, :, DF.varC.index(code)]
ns = len(DF.siteNoLst)
# FIX: was `np.ndarray([ns, 2])` -- the raw ndarray constructor returns
# uninitialized memory (NumPy docs advise array/zeros/empty instead).
# NaN-fill makes any unwritten row visibly missing.
out = np.full([ns, 2], np.nan)
for indS in range(ns):
    # log(q+1): all days vs only days with a concentration observation
    q1 = np.log(q[:, indS]+1)
    ind = np.where(~np.isnan(c[:, indS]))[0]
    q2 = np.log(q[ind, indS]+1)
    # NOTE(review): q1 may contain NaN from missing discharge -- confirm
    # ks_2samp handles that as intended
    s, p = scipy.stats.ks_2samp(q1, q2)
    out[indS, 0] = s
    out[indS, 1] = len(ind)
dfCrd = gageII.readData(
    varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
import scipy
import time
import matplotlib.pyplot as plt
from hydroDL.post import axplot, figplot
from hydroDL.master import basins
from hydroDL.data import gageII, usgs, gridMET
from hydroDL import kPath, utils
import os
import pandas as pd
import numpy as np
from hydroDL import kPath
from hydroDL.data import dbBasin, usgs

# create a dataFrame contains all C and Q
# read the full 1979+ site inventory (site numbers kept as strings to
# preserve leading zeros)
fileSiteNo = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
# one-time dataset creation, kept for reference; normally just load it
# varG = ['LAT_GAGE', 'LNG_GAGE', 'CLASS', 'DRAIN_SQKM']
# DF = dbBasin.DataFrameBasin.new(
#     'allCQ', siteNoLstAll, varF=['pr'], varQ=['00060'], varG=varG)
DF = dbBasin.DataFrameBasin('allCQ')
import pandas as pd
from hydroDL.data import usgs, gageII, gridMET, ntn, GLASS, transform, dbBasin
import numpy as np
import matplotlib.pyplot as plt
from hydroDL.post import axplot, figplot
from hydroDL import kPath, utils
import json
import os
import importlib
from hydroDL.master import basinFull
import statsmodels.api as sm
import time

# Script: set up data and hyper-parameters for a WRTDS-style weighted
# regression run on the G400Norm dataset.
DF = dbBasin.DataFrameBasin('G400Norm')
trainSet = 'rmRT20'
testSet = 'all'
# h: window half-widths for the weighting -- presumably [years,
# log-discharge, season]; confirm against the weight function
h = [7, 2, 0.5]
# the: threshold used by the weighting/selection step -- TODO confirm
the = 100
# small offset so log(discharge) stays finite at zero flow
sn = 1e-5

# Calculate WRTDS from train and test set
varX = ['00060', 'sinT', 'cosT', 'datenum']
varY = usgs.newC
d1 = dbBasin.DataModelBasin(DF, subset=trainSet, varX=varX, varY=varY)
d2 = dbBasin.DataModelBasin(DF, subset=testSet, varX=varX, varY=varY)
# decimal time: year + fraction-of-year (365-day convention)
tt = pd.to_datetime(DF.t)
yr = tt.year.values
t = yr+tt.dayofyear.values/365
###
# output container [time, site, code], NaN where no prediction is made
yOut = np.full([len(d2.t), len(d2.siteNoLst), len(varY)], np.nan)