Example #1
0
def _yearAndDecimalYear(tIndex):
    """Return (calendar year, year + dayofyear / 365) arrays for a DatetimeIndex."""
    yr = tIndex.year.values
    return yr, yr + tIndex.dayofyear.values / 365


def testWRTDS(dataName, trainSet, testSet, codeLst):
    """Predict every code in codeLst on testSet with per-time-step weighted regressions.

    For each site and code a weighted least-squares model (sm.WLS) is fitted
    on the valid training samples and evaluated at each valid test time step.
    Weights and the selected training samples come from calWeight, which
    receives the time, log-flow and seasonal distances between the test step
    and the training samples.

    Relies on the module-level names sn (offset added before the log),
    calWeight, sm, utils, dbBasin, np, pd and time.

    Args:
        dataName: name handed to dbBasin.DataFrameBasin.
        trainSet: subset name used to fit the regressions.
        testSet: subset name to predict on; with 'all' only the predictors
            need to be valid, otherwise the observation must be valid too.
        codeLst: list of target variable codes (used as varY).

    Returns:
        Array of shape [len(test t), len(test sites), len(codeLst)], NaN
        wherever no prediction was made.
    """
    DF = dbBasin.DataFrameBasin(dataName)
    varX = ['00060']
    varY = codeLst
    dmTrain = dbBasin.DataModelBasin(
        DF, subset=trainSet, varX=varX, varY=varY)
    dmTest = dbBasin.DataModelBasin(
        DF, subset=testSet, varX=varX, varY=varY)

    # time predictors: calendar year plus an annual sine / cosine pair
    yrTrain, tTrain = _yearAndDecimalYear(pd.to_datetime(dmTrain.t))
    sinTrain = np.sin(2 * np.pi * tTrain)
    cosTrain = np.cos(2 * np.pi * tTrain)
    yrTest, tTest = _yearAndDecimalYear(pd.to_datetime(dmTest.t))
    sinTest = np.sin(2 * np.pi * tTest)
    cosTest = np.cos(2 * np.pi * tTest)

    yOut = np.full(
        (len(dmTest.t), len(dmTest.siteNoLst), len(varY)), np.nan)
    tStart = time.time()
    for iSite, siteNo in enumerate(dmTest.siteNoLst):
        for iCode, code in enumerate(varY):
            print('{} {} {} {}'.format(
                iSite, siteNo, code, time.time() - tStart))

            # training predictors / target for this site and code
            obsTrain = dmTrain.Y[:, iSite, iCode].copy()
            flowTrain = dmTrain.X[:, iSite, 0].copy()
            flowTrain[flowTrain < 0] = 0
            logqTrain = np.log(flowTrain + sn)
            xTrain = np.stack(
                [logqTrain, yrTrain, sinTrain, cosTrain], axis=1)

            # test predictors / target for this site and code
            obsTest = dmTest.Y[:, iSite, iCode].copy()
            flowTest = dmTest.X[:, iSite, 0].copy()
            flowTest[flowTest < 0] = 0
            logqTest = np.log(flowTest + sn)
            xTest = np.stack(
                [logqTest, yrTest, sinTest, cosTest], axis=1)

            [xFit, yFit], idxTrain = utils.rmNan([xTrain, obsTrain])
            testArrays = [xTest] if testSet == 'all' else [xTest, obsTest]
            _, idxTest = utils.rmNan(testArrays)

            # too few valid training samples: leave this column as NaN
            if len(idxTrain) < 40:
                continue

            for k in idxTest:
                gapT = np.abs(tTest[k] - tTrain[idxTrain])
                gapQ = np.abs(logqTest[k] - logqTrain[idxTrain])
                # distance to the nearest whole number of years
                gapS = np.minimum(np.abs(np.ceil(gapT) - gapT),
                                  np.abs(gapT - np.floor(gapT)))
                ww, sel = calWeight(np.stack([gapT, gapQ, gapS]))
                fit = sm.WLS(yFit[sel], xFit[sel], weights=ww).fit()
                yOut[k, iSite, iCode] = fit.predict(xTest[k, :])[0]
    return yOut
Example #2
0
def trainModel(outName):
    """Train the model described by the master file of outName.

    Loads the master dictionary, builds and transforms the training data,
    creates the loss / model / optimizer, then trains in chunks of
    dictP['saveEpoch'] epochs, saving the model state after every chunk.
    The per-epoch losses are written to 'loss.csv' in the output folder.

    The final chunk is shortened when dictP['nEpoch'] is not a multiple of
    dictP['saveEpoch'], so training stops at nEpoch and the last saved state
    is labelled with nEpoch. This assumes the nEp argument of
    trainBasin.trainModel is the number of epochs to run in that call and
    cEp is the number of epochs already done.

    Args:
        outName: name of the experiment; resolved to a folder by nameFolder
            and to a master dictionary by loadMaster.

    Raises:
        RuntimeError: if dictP['optim'] is not 'AdaDelta'.
    """
    outFolder = nameFolder(outName)
    dictP = loadMaster(outName)

    # load data
    DF = dbBasin.DataFrameBasin(dictP['dataName'])
    dictVar = {k: dictP[k]
               for k in ('varX', 'varXC', 'varY', 'varYC')}
    DM = dbBasin.DataModelBasin(DF, subset=dictP['trainSet'], **dictVar)
    if dictP['borrowStat'] is not None:
        DM.loadStat(dictP['borrowStat'])
    DM.trans(mtdX=dictP['mtdX'], mtdXC=dictP['mtdXC'],
             mtdY=dictP['mtdY'], mtdYC=dictP['mtdYC'])
    DM.saveStat(outFolder)
    dataTup = DM.getData()
    dataTup = trainBasin.dealNaN(dataTup, dictP['optNaN'])

    # define loss
    lossFun = getattr(crit, dictP['crit'])()
    # define model
    model = defineModel(dataTup, dictP)

    if torch.cuda.is_available():
        lossFun = lossFun.cuda()
        model = model.cuda()

    if dictP['optim'] == 'AdaDelta':
        optim = torch.optim.Adadelta(model.parameters())
    else:
        raise RuntimeError(
            'optimizer {!r} is not supported'.format(dictP['optim']))

    lossLst = list()
    nEp = dictP['nEpoch']
    sEp = dictP['saveEpoch']
    # start every run with a fresh log file
    logFile = os.path.join(outFolder, 'log')
    if os.path.exists(logFile):
        os.remove(logFile)
    for k in range(0, nEp, sEp):
        # clamp the last chunk so the run never exceeds nEp epochs
        nChunk = min(sEp, nEp - k)
        model, optim, lossEp = trainBasin.trainModel(
            dataTup, model, lossFun, optim, batchSize=dictP['batchSize'],
            nEp=nChunk, cEp=k, logFile=logFile,
            optBatch=dictP['optBatch'], nIterEp=dictP['nIterEp'])
        # save model
        saveModelState(outName, k + nChunk, model, optim=optim)
        lossLst.extend(lossEp)

    lossFile = os.path.join(outFolder, 'loss.csv')
    pd.DataFrame(lossLst).to_csv(lossFile, index=False, header=False)
Example #3
0
def testModel(outName,  DF=None, testSet='all', ep=None, reTest=False, batchSize=20):
    """Return model predictions for testSet, using a cached result if present.

    Predictions are cached in the output folder as
    'testP-{testSet}-Ep{ep}.npz'. When that file exists and reTest is False
    the cached arrays are returned; otherwise the model state of epoch ep is
    loaded, run on the test data, the outputs are transformed back with
    DM.transOutY / DM.transOutYC and the cache file is (re)written.

    Args:
        outName: experiment name (resolved by loadMaster / nameFolder).
        DF: optional, already loaded dbBasin.DataFrameBasin; loaded from
            dictP['dataName'] when None.
        testSet: subset name to predict on.
        ep: epoch of the saved model state; defaults to dictP['nEpoch'].
        reTest: pass anything other than False to ignore an existing cache.
        batchSize: batch size forwarded to trainBasin.testModel.

    Returns:
        Tuple (yP, ycP) of transformed predictions.
    """
    # load master
    dictP = loadMaster(outName)
    if ep is None:
        ep = dictP['nEpoch']
    outFolder = nameFolder(outName)
    testFileName = 'testP-{}-Ep{}.npz'.format(testSet, ep)
    testFile = os.path.join(outFolder, testFileName)

    if os.path.exists(testFile) and reTest is False:
        print('load saved test result')
        # the context manager closes the underlying zip file handle
        with np.load(testFile, allow_pickle=True) as npz:
            yP = npz['yP']
            ycP = npz['ycP']
        # np.savez wraps a non-array value (e.g. None) in a 0-d object array;
        # unwrap it so the cached path returns what the fresh path returned
        if yP.dtype == object and yP.ndim == 0:
            yP = yP.item()
        if ycP.dtype == object and ycP.ndim == 0:
            ycP = ycP.item()
    else:
        # load test data
        if DF is None:
            DF = dbBasin.DataFrameBasin(dictP['dataName'])
        dictVar = {k: dictP[k]
                   for k in ('varX', 'varXC', 'varY', 'varYC')}
        DM = dbBasin.DataModelBasin(DF, subset=testSet, **dictVar)
        # reuse the statistics saved in the output folder
        DM.loadStat(outFolder)
        dataTup = DM.getData()
        dataTup = trainBasin.dealNaN(dataTup, dictP['optNaN'])

        model = defineModel(dataTup, dictP)
        model = loadModelState(outName, ep, model)
        # test
        x = dataTup[0]
        xc = dataTup[1]
        ny = np.shape(dataTup[2])[2]
        # test model - point by point
        yOut, ycOut = trainBasin.testModel(
            model, x, xc, ny, batchSize=batchSize)
        yP = DM.transOutY(yOut)
        ycP = DM.transOutYC(ycOut)
        np.savez(testFile, yP=yP, ycP=ycP)
    return yP, ycP
Example #4
0
from hydroDL.data import usgs, gageII, gridMET, ntn, GLASS
from hydroDL.master import slurm
from hydroDL.data import dbBasin
from hydroDL.master import basinFull

if __name__ == '__main__':
    # Re-run the epoch-200 test on the whole data set for each listed model.
    dataNameLst = ['G200Norm', 'G400Norm']
    for dataName in dataNameLst:
        outName = dataName
        DF = dbBasin.DataFrameBasin(dataName)
        testSet = 'all'
        try:
            yP, ycP = basinFull.testModel(outName,
                                          DF=DF,
                                          testSet=testSet,
                                          ep=200,
                                          reTest=True)
        except Exception as err:
            # Best effort: keep going with the next model, but report why
            # this one failed. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit stop the script.
            print('skiped {}: {!r}'.format(outName, err), flush=True)
        else:
            print('tested {}'.format(outName), flush=True)
Example #5
0
import numpy as np
import matplotlib.pyplot as plt
from hydroDL.post import axplot, figplot
from hydroDL import kPath, utils
import json
import os
import importlib
from hydroDL.master import basinFull
from hydroDL.app.waterQuality import WRTDS

dataName = 'G400Norm'
outName = dataName
trainSet = 'rmRT20'
testSet = 'pkRT20'

# predictions of the epoch-500 model on the test subset
DF = dbBasin.DataFrameBasin(outName)
yP, ycP = basinFull.testModel(outName, DF=DF, testSet=testSet, ep=500)

# deal with mean and std: rescale each code with its '<code>-S' column and
# shift by its '<code>-M' column of DF.g
codeLst = usgs.newC
# NaN-initialised so that any channel of yP without a matching code stays
# NaN instead of holding uninitialised memory
yOut = np.full(yP.shape, np.nan)
for k, code in enumerate(codeLst):
    m = DF.g[:, DF.varG.index(code + '-M')]
    s = DF.g[:, DF.varG.index(code + '-S')]
    data = yP[:, :, k]
    yOut[:, :, k] = data * s + m

# WRTDS
# yW = WRTDS.testWRTDS(dataName, trainSet, testSet, codeLst)
dirRoot = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS-dbBasin')
fileName = '{}-{}-{}'.format(dataName, trainSet, 'all')
Example #6
0
import matplotlib.pyplot as plt
from hydroDL.post import axplot, figplot
from hydroDL import kPath, utils
import json
import os
import importlib
from hydroDL.master import basinFull
importlib.reload(utils)

# read the site selection dictionary and keep its 'k12' entry
dirSel = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteSel')
dictSiteName = 'dictWeathering.json'
with open(os.path.join(dirSel, dictSiteName)) as f:
    dictSite = json.load(f)
siteNoLst = dictSite['k12']

# normalize the 'weathering' data set (localNorm on subset 'rmD5') and store
# the result under a new name
codeSel = ['00915', '00925', '00930', '00935', '00940', '00945', '00955']
DF = dbBasin.DataFrameBasin('weathering')
DF = dbBasin.func.localNorm(DF, subset='rmD5')
dataName = 'weatheringNorm'
DF.saveAs(dataName)

label = 'test'

# variable lists: inputs, targets ('-N' columns), constant inputs (gageII
# attributes plus the '-M' and '-S' columns of every selected code)
varX = DF.varF + DF.varQ
varY = ['{}-N'.format(c) for c in codeSel]
varXC = gageII.varLst + [
    c + suffix for suffix in ('-M', '-S') for c in codeSel]
varYC = None

# transformation methods matching each variable list
mtdX = dbBasin.io.extractVarMtd(varX)
mtdY = dbBasin.io.extractVarMtd(varY)
mtdXC = dbBasin.io.extractVarMtd(varXC)
mtdYC = dbBasin.io.extractVarMtd(varYC)
Example #7
0
import torch
from hydroDL.model import rnn, crit, trainBasin

# Convert a model saved as a whole object into a state-dict file, then
# rebuild the data needed to re-create the model from the master file.
outName = 'weathering-FPR2QC-t365-B10'
ep = 100

# save: load the full model object of epoch `ep` and write only its state dict
outFolder = basinFull.nameFolder(outName)
modelFile = os.path.join(outFolder, 'model_ep{}'.format(ep))
# NOTE(review): torch.load unpickles the file; only use on trusted model files
model = torch.load(modelFile)
modelStateFile = os.path.join(outFolder, 'modelState_ep{}'.format(ep))
torch.save(model.state_dict(), modelStateFile)

# load: rebuild the data model from the master dictionary of this run
dictP = basinFull.loadMaster(outName)
DF = dbBasin.DataFrameBasin(dictP['dataName'])
dictVar = {k: dictP[k] for k in ('varX', 'varXC', 'varY', 'varYC')}
DM = dbBasin.DataModelBasin(DF, subset='A10', **dictVar)
# reuse the statistics stored in the output folder
DM.loadStat(outFolder)
dataTup = DM.getData()
# sizes are read before dealNaN is applied to the tuple
[nx, nxc, ny, nyc, nt, ns] = trainBasin.getSize(dataTup)
dataTup = trainBasin.dealNaN(dataTup, dictP['optNaN'])
if dictP['modelName'] == 'CudnnLSTM':
    model = rnn.CudnnLstmModel(nx=nx + nxc,
                               ny=ny + nyc,
                               hiddenSize=dictP['hiddenSize'])
elif dictP['modelName'] == 'LstmModel':
    model = rnn.LstmModel(nx=nx + nxc,
                          ny=ny + nyc,
                          hiddenSize=dictP['hiddenSize'])
else:
Example #8
0
import matplotlib.pyplot as plt
# load the full site list and the daily sample-count matrix
dirInv = os.path.join(kPath.dirData, 'USGS', 'inventory')
fileSiteNo = os.path.join(dirInv, 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
codeLst = sorted(usgs.codeLst)
countD = np.load(os.path.join(dirInv, 'matCountDaily.npy'))

# keep the sites whose summed count for the chosen code exceeds 200
code = '00915'
indC = codeLst.index(code)
count = countD[:, :, indC].sum(axis=1)
indSLst = np.where(count > 200)[0]
siteNoLst = [siteNoLstAll[i] for i in indSLst]

# DF = dbBasin.DataFrameBasin.new('00915G200', siteNoLst)
DF = dbBasin.DataFrameBasin('00915G200')
q = DF.q[:, :, 1]
c = DF.c[:, :, DF.varC.index(code)]

# per site: two-sample KS statistic between log(q + 1) over all time steps
# and over the time steps that have a sample of the code, plus sample count
ns = len(DF.siteNoLst)
out = np.ndarray([ns, 2])
for indS in range(ns):
    ind = np.where(~np.isnan(c[:, indS]))[0]
    q1 = np.log(q[:, indS]+1)
    q2 = q1[ind]
    s, p = scipy.stats.ks_2samp(q1, q2)
    out[indS, :] = (s, len(ind))

dfCrd = gageII.readData(
    varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
Example #9
0
import scipy
import time
import matplotlib.pyplot as plt
from hydroDL.post import axplot, figplot
from hydroDL.master import basins
from hydroDL.data import gageII, usgs, gridMET
from hydroDL import kPath, utils
import os
import pandas as pd
import numpy as np
from hydroDL import kPath
from hydroDL.data import dbBasin, usgs

# Create a DataFrameBasin that contains all C and Q.

# Site numbers are read as strings from a header-less, single-column file.
fileSiteNo = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()

# One-off creation of the 'allCQ' data set, kept for reference:
# varG = ['LAT_GAGE', 'LNG_GAGE', 'CLASS', 'DRAIN_SQKM']
# DF = dbBasin.DataFrameBasin.new(
#     'allCQ', siteNoLstAll, varF=['pr'], varQ=['00060'], varG=varG)

# Open the data set by name.
DF = dbBasin.DataFrameBasin('allCQ')
Example #10
0
import pandas as pd
from hydroDL.data import usgs, gageII, gridMET, ntn, GLASS, transform, dbBasin
import numpy as np
import matplotlib.pyplot as plt
from hydroDL.post import axplot, figplot
from hydroDL import kPath, utils
import json
import os
import importlib
from hydroDL.master import basinFull
import statsmodels.api as sm
import time

# Set up the inputs for a WRTDS run on the 'G400Norm' data set.
DF = dbBasin.DataFrameBasin('G400Norm')
trainSet = 'rmRT20'
testSet = 'all'

# NOTE(review): h, the and sn are not used in the visible part of the script;
# presumably h holds the WRTDS window widths, the a sample-count threshold
# and sn a small offset added to flow before taking the log - confirm
h = [7, 2, 0.5]
the = 100
sn = 1e-5

# Calculate WRTDS from train and test set
varX = ['00060', 'sinT', 'cosT', 'datenum']
varY = usgs.newC
d1 = dbBasin.DataModelBasin(DF, subset=trainSet, varX=varX, varY=varY)
d2 = dbBasin.DataModelBasin(DF, subset=testSet, varX=varX, varY=varY)
# decimal year: calendar year plus day of year divided by 365
tt = pd.to_datetime(DF.t)
yr = tt.year.values
t = yr+tt.dayofyear.values/365
###
# output array [time, site, variable], NaN until a prediction is written
yOut = np.full([len(d2.t), len(d2.siteNoLst), len(varY)], np.nan)