Code Example #1
def wrapData(caseName,
             siteNoLst,
             nFill=5,
             freq='D',
             sdStr='1979-01-01',
             edStr='2019-12-31'):
    varF = gridMET.varLst
    varQ = usgs.varQ
    varG = gageII.lstWaterQuality
    varC = usgs.newC

    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    tR = pd.date_range(np.datetime64(sdStr), np.datetime64(edStr))
    fLst, qLst, gLst, cLst = [list() for x in range(4)]

    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        varLst = varQ + varF + varC
        df = readSiteTS(siteNo, varLst=varLst, freq=freq)
        # streamflow
        tempQ = pd.DataFrame({'date': tR}).set_index('date').join(df[varQ])
        qLst.append(tempQ.values)
        # forcings
        tempF = pd.DataFrame({'date': tR}).set_index('date').join(df[varF])
        tempF = tempF.interpolate(limit=nFill,
                                  limit_direction='both',
                                  limit_area='inside')
        fLst.append(tempF.values)
        # water quality
        tempC = pd.DataFrame({'date': tR}).set_index('date').join(df[varC])
        cLst.append(tempC.values)
        # geog
        gLst.append(tabG.loc[siteNo].values)
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)

    # save
    saveFolder = caseFolder(caseName)
    if not os.path.exists(saveFolder):
        os.mkdir(saveFolder)
    np.savez_compressed(os.path.join(saveFolder, 'data'), c=c, q=q, f=f, g=g)
    dictData = dict(name=caseName,
                    varG=varG,
                    varQ=varQ,
                    varF=varF,
                    varC=varC,
                    sd=sdStr,
                    ed=edStr,
                    freq=freq,
                    siteNoLst=siteNoLst)
    with open(os.path.join(saveFolder, 'info') + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
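The stack/swapaxes idiom at the end of wrapData is what produces the [nT, nP, nX] and [nP, nG] layouts the rest of the pipeline expects. A minimal shape check with synthetic data (array sizes here are invented for illustration):

import numpy as np

nT, nX, nP = 10, 3, 4  # time steps, forcing variables, sites (toy sizes)
fLst = [np.random.rand(nT, nX) for _ in range(nP)]  # one [nT, nX] table per site
f = np.stack(fLst, axis=-1).swapaxes(1, 2)  # [nT, nX, nP] -> [nT, nP, nX]
assert f.shape == (nT, nP, nX)

gLst = [np.random.rand(7) for _ in range(nP)]  # one attribute vector per site
g = np.stack(gLst, axis=-1).swapaxes(0, 1)  # [nG, nP] -> [nP, nG]
assert g.shape == (nP, 7)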
Code Example #2
def wrapData(caseName,
             siteNoLst,
             nFill=5,
             freq='D',
             sdStr='1979-01-01',
             edStr='2019-12-31',
             varF=gridMET.varLst + ntn.varLst + GLASS.varLst,
             varQ=usgs.varQ,
             varG=gageII.varLst,
             varC=usgs.newC):
    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    tR = pd.date_range(np.datetime64(sdStr), np.datetime64(edStr))
    fLst, qLst, gLst, cLst = [list() for x in range(4)]

    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        varLst = varQ + varF + varC
        df = readSiteTS(siteNo, varLst=varLst, freq=freq)
        # streamflow
        tempQ = pd.DataFrame({'date': tR}).set_index('date').join(df[varQ])
        qLst.append(tempQ.values)
        # forcings
        tempF = pd.DataFrame({'date': tR}).set_index('date').join(df[varF])
        tempF = tempF.interpolate(limit=nFill,
                                  limit_direction='both',
                                  limit_area='inside')
        fLst.append(tempF.values)
        # water quality
        tempC = pd.DataFrame({'date': tR}).set_index('date').join(df[varC])
        cLst.append(tempC.values)
        # geog
        gLst.append(tabG.loc[siteNo].values)
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)

    # save
    saveDataFrame(caseName,
                  c=c,
                  q=q,
                  f=f,
                  g=g,
                  varC=varC,
                  varQ=varQ,
                  varF=varF,
                  varG=varG,
                  sdStr=sdStr,
                  edStr=edStr,
                  freq=freq,
                  siteNoLst=siteNoLst)
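Reading a saved case back presumably mirrors the save step in Code Example #1; a sketch under the assumption that the case folder holds the data.npz and info.json written there (loadCase is a hypothetical helper, not part of the repository):

import os
import json
import numpy as np

def loadCase(saveFolder):
    # hypothetical reader for the files written by wrapData above
    npz = np.load(os.path.join(saveFolder, 'data.npz'))
    f, q, c, g = npz['f'], npz['q'], npz['c'], npz['g']
    with open(os.path.join(saveFolder, 'info.json')) as fp:
        dictData = json.load(fp)
    return (f, q, c, g), dictData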
Code Example #3
File: parWRTDS.py    Project: sadeghst/geolearn
# WRTDS corr
dirWrtds = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS-D', 'All')
fileC = os.path.join(dirWrtds, 'corr')
dfCorr = pd.read_csv(fileC, dtype={'siteNo': str}).set_index('siteNo')

code = '00915'
codeName = usgs.codePdf.loc[code]['shortName']
# load WRTDS par
fileP = os.path.join(dirWrtds, 'params', code)
dfPar = pd.read_csv(fileP, dtype={'siteNo': str}).set_index('siteNo')
# select site by count
n = 40 * 2
dfParSel = dfPar[dfPar['count'] > n]
siteNoLst = dfParSel.index.tolist()
dfCorrSel = dfCorr.loc[siteNoLst][code]
dfCrd = gageII.readData(varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
dfCrd = gageII.updateCode(dfCrd)
lat = dfCrd['LAT_GAGE'].values
lon = dfCrd['LNG_GAGE'].values

# plot map
parLst = ['pQ', 'pSinT', 'pCosT', 'pYr', 'b']
figM, axM = plt.subplots(3, 2, figsize=(12, 16))
axplot.mapPoint(axM[0, 0], lat, lon, dfCorrSel.values, s=16)
axM[0, 0].set_title('WRTDS corr {}'.format(codeName))
for k, par in enumerate(parLst):
    iy, ix = utils.index2d(k + 1, 3, 2)
    axplot.mapPoint(axM[iy, ix], lat, lon, dfParSel[par].values, s=16)
    axM[iy, ix].set_title('WRTDS {} {}'.format(par, codeName))
figM.show()
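utils.index2d(k + 1, 3, 2) maps a flat panel index onto the 3x2 axes grid, skipping panel 0 which holds the correlation map. A minimal stand-in, assuming row-major order (the real helper may differ):

def index2d(k, ny, nx):
    # flat index -> (row, col) on an ny-by-nx grid, row-major (assumed)
    return k // nx, k % nx

# panels 1..5 then hold the five WRTDS parameters
[index2d(k + 1, 3, 2) for k in range(5)]  # [(0, 1), (1, 0), (1, 1), (2, 0), (2, 1)]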
Code Example #4
File: basins.py    Project: sadeghst/geolearn
def testModelSeq(outName,
                 siteNoLst,
                 wqData=None,
                 ep=None,
                 returnOut=False,
                 retest=False,
                 sd=np.datetime64('1979-01-01'),
                 ed=np.datetime64('2019-12-31')):
    # run sequence test for all sites; defaults to the full period from first to last date
    if type(siteNoLst) is not list:
        siteNoLst = [siteNoLst]
    master = loadMaster(outName)
    if master['crit'] == 'SigmaLoss':
        doSigma = True
    else:
        doSigma = False
    if ep is None:
        ep = master['nEpoch']
    outDir = nameFolder(outName)
    sdS = pd.to_datetime(sd).strftime('%Y%m%d')
    edS = pd.to_datetime(ed).strftime('%Y%m%d')
    saveDir = os.path.join(outDir, 'seq-{}-{}-ep{}'.format(sdS, edS, ep))
    if not os.path.exists(saveDir):
        os.mkdir(saveDir)
    siteSaveLst = os.listdir(saveDir)
    if retest is True:
        sitePredLst = siteNoLst
    else:
        sitePredLst = [
            siteNo for siteNo in siteNoLst if siteNo not in siteSaveLst
        ]
    if len(sitePredLst) != 0:
        if wqData is None:
            wqData = waterQuality.DataModelWQ(master['dataName'])
        (varX, varXC, varY, varYC) = (master['varX'], master['varXC'],
                                      master['varY'], master['varYC'])
        (statX, statXC, statY, statYC) = loadStat(outName)
        model = loadModel(outName, ep=ep)
        tabG = gageII.readData(varLst=varXC, siteNoLst=siteNoLst)
        tabG = gageII.updateCode(tabG)
        for siteNo in sitePredLst:
            if 'DRAIN_SQKM' in varXC:
                area = tabG[tabG.index == siteNo]['DRAIN_SQKM'].values[0]
            else:
                area = None
            # test model
            print('testing {} from {} to {}'.format(siteNo, sdS, edS))
            freq = wqData.freq
            dfX = waterQuality.readSiteTS(siteNo,
                                          varX,
                                          freq=freq,
                                          area=area,
                                          sd=sd,
                                          ed=ed)
            # dfX = waterQuality.readSiteX(
            #     siteNo, varX, sd=sd, ed=ed, area=area, nFill=5)
            xA = np.expand_dims(dfX.values, axis=1)
            xcA = np.expand_dims(tabG.loc[siteNo].values.astype(float),
                                 axis=0)
            mtdX = waterQuality.extractVarMtd(varX)
            x = transform.transInAll(xA, mtdX, statLst=statX)
            mtdXC = waterQuality.extractVarMtd(varXC)
            xc = transform.transInAll(xcA, mtdXC, statLst=statXC)
            [x, xc] = trainTS.dealNaN([x, xc], master['optNaN'][:2])
            yOut = trainTS.testModel(model, x, xc)
            # transfer out
            nt = len(dfX)
            ny = len(varY) if varY is not None else 0
            nyc = len(varYC) if varYC is not None else 0
            if doSigma:
                yP = np.full([nt, ny + nyc], np.nan)
                sP = np.full([nt, ny + nyc], np.nan)
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny * 2:2], statY,
                                             varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny * 2::2], statYC,
                                             varYC)
                sP[:, :ny] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, 1:ny * 2:2])), statY, varY)
                sP[:, ny:] = wqData.transOut(
                    np.sqrt(np.exp(yOut[:, 0, ny * 2 + 1::2])), statYC, varYC)
            else:
                yP = np.full([nt, ny + nyc], np.nan)
                yP[:, :ny] = wqData.transOut(yOut[:, 0, :ny], statY, varY)
                yP[:, ny:] = wqData.transOut(yOut[:, 0, ny:], statYC, varYC)
            # save output
            t = dfX.index.values.astype('datetime64[D]')
            colY = [] if varY is None else varY
            colYC = [] if varYC is None else varYC
            dfOut = pd.DataFrame(data=yP, columns=colY + colYC, index=t)
            dfOut.index.name = 'date'
            dfOut = dfOut.reset_index()
            dfOut.to_csv(os.path.join(saveDir, siteNo), index=False)
            if doSigma:
                dfOutS = pd.DataFrame(data=sP, columns=colY + colYC, index=t)
                dfOutS.index.name = 'date'
                dfOutS = dfOutS.reset_index()
                dfOutS.to_csv(os.path.join(saveDir, siteNo + '_sigma'),
                              index=False)
    # load all csv
    if returnOut:
        dictOut = dict()
        for siteNo in siteNoLst:
            # print('loading {} from {} to {}'.format(siteNo, sdS, edS))
            dfOut = pd.read_csv(os.path.join(saveDir, siteNo))
            dictOut[siteNo] = dfOut
            if doSigma:
                dfOut = pd.read_csv(os.path.join(saveDir, siteNo + '_sigma'))
                dictOut[siteNo + '_sigma'] = dfOut
        return dictOut
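With SigmaLoss, the network output interleaves a mean and a log-variance per target, which is what the strided slices yOut[:, 0, :ny * 2:2] and yOut[:, 0, 1:ny * 2:2] pick apart. A toy illustration of that layout (the channel order is inferred from the slicing above):

import numpy as np

nt, ny = 4, 2  # time steps, sequence targets (toy sizes)
yOut = np.random.rand(nt, 1, ny * 2)  # [time, batch, interleaved channels]
mu = yOut[:, 0, :ny * 2:2]  # even channels: predicted means
logVar = yOut[:, 0, 1:ny * 2:2]  # odd channels: predicted log-variances
sigma = np.sqrt(np.exp(logVar))  # back-transform to a standard deviation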
Code Example #5
File: 121_tsMap_rmse.py    Project: sadeghst/geolearn
reTest = False
dataName = 'rbWN5'
siteNoLst = dictSite['comb']
nSite = len(siteNoLst)

# load all sequence
if False:
    importlib.reload(wq.wqLoad)
    outNameLSTM = '{}-{}-{}-{}'.format('rbWN5', 'comb', 'QTFP_C', 'comb-B10')
    dictLSTM, dictWRTDS, dictObs = wq.loadModel(
        siteNoLst, outNameLSTM, codeLst)
    corrMat, rmseMat = wq.dictErr(dictLSTM, dictWRTDS, dictObs, codeLst)
    # load basin attributes
    dfG = gageII.readData(siteNoLst=siteNoLst)
    dfG = gageII.updateRegion(dfG)
    dfG = gageII.updateCode(dfG)

t = dictObs[siteNoLst[0]].index.values
tt = np.datetime64('2010-01-01')
t0 = np.datetime64('1980-01-01')
ind1 = np.where((t < tt) & (t >= t0))[0]
ind2 = np.where(t >= tt)[0]

# calculate interval
if False:
    intMatC = np.full([len(siteNoLst), len(codeLst), 4], np.nan)
    for k, siteNo in enumerate(siteNoLst):
        dfC = dictObs[siteNo]
        print('\t {}/{}'.format(k, len(siteNoLst)), end='\r')
        for j, code in enumerate(codeLst):
            tC = dfC.iloc[ind1][code].dropna().index.values
Code Example #6
File: wrapSulfateNE2.py    Project: sadeghst/geolearn
tabLst = list()
for k in range(len(tab)):
    t1 = pd.to_datetime(tab.iloc[k]['dateon']).date()
    t2 = pd.to_datetime(tab.iloc[k]['dateoff']).date()
    tt = pd.date_range(t1, t2)[:-1]
    data = np.tile(tab.iloc[k][varPLst].values, [len(tt), 1])
    tabLst.append(pd.DataFrame(index=tt, columns=varPLst, data=data))
# DataFrame.append was removed in pandas 2.0; build a list and concat once
dfP = pd.concat(tabLst)
dfP = dfP.dropna(how='all')

startDate = pd.Timestamp(1979, 1, 1)
endDate = pd.Timestamp(2019, 12, 31)

# gageII
tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
tabG = gageII.updateCode(tabG)

# read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
fLst = list()  # forcing ts
gLst = list()  # geo-const
qLst = list()  # streamflow
cLst = list()  # water quality
cfLst = list()  # water quality flags
infoLst = list()
t0 = time.time()
for i, siteNo in enumerate(siteNoLst):
    t1 = time.time()
    dfC, dfCF = usgs.readSample(siteNo,
                                codeLst=varC,
                                startDate=startDate,
                                flag=2)
Code Example #7
def wrapData(caseName,
             siteNoLst,
             rho=365,
             nFill=5,
             varC=usgs.varC,
             varG=gageII.lstWaterQuality):
    """ wrap up input and target data for the model,as:
    x=[nT,nP,nX]
    y=[nP,nY]
    c=[nP,nC]
    where nP is number of time series
    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS site
    Keyword Arguments:
        rho {int} -- [description] (default: {365})
        nFill {int} -- max number of continous nan to interpolate in input data (default: {5})
        varC {list} -- list of water quality code to learn (default: {usgs.lstCodeSample})
        varG {list} -- list of constant variables in gageII (default: {gageII.lstWaterQuality})
        varQ and varF are fixed so far
    """
    # add a start/end date to improve efficiency.
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)

    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)

    # read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
    fLst = list()  # forcing ts
    gLst = list()  # geo-const
    qLst = list()  # streamflow
    cLst = list()  # water quality
    cfLst = list()  # water quality flags
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        dfC, dfCF = usgs.readSample(siteNo,
                                    codeLst=varC,
                                    startDate=startDate,
                                    flag=2)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        dfF = gridMET.readBasin(siteNo)
        for k in range(len(dfC)):
            ct = dfC.index[k]
            ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            if (ctR[0] < startDate) or (ctR[-1] > endDate):
                continue
            tempQ = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfQ).interpolate(limit=nFill,
                                                       limit_direction='both')
            tempF = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfF).interpolate(limit=nFill,
                                                       limit_direction='both')
            qLst.append(tempQ.values)
            fLst.append(tempF.values)
            cLst.append(dfC.iloc[k].values)
            cfLst.append(dfCF.iloc[k].values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    cf = np.stack(cfLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    infoDf = pd.DataFrame(infoLst)
    # add runoff
    runoff = calRunoff(q[:, :, 0], infoDf)
    q = np.stack([q[:, :, 0], runoff], axis=-1).astype(np.float32)
    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g, cf=cf)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName,
                    rho=rho,
                    nFill=nFill,
                    varG=varG,
                    varC=varC,
                    varQ=['00060', 'runoff'],
                    varF=gridMET.varLst,
                    siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
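A call sketch for this version of wrapData; the case name is invented and '01013500' merely stands in for a real USGS site number. The call writes caseName.npz, .csv and .json under kPath.dirWQ/trainData as shown above:

# hypothetical invocation
wrapData('sulfateTest', ['01013500'], rho=365, nFill=5)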
Code Example #8
# ts map of single dataset, label and code
freq = 'W'
dirRoot1 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly')
dirRoot2 = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS_weekly_rmq')

code = '00955'
dfRes1 = pd.read_csv(os.path.join(dirRoot1, 'result', code),
                     dtype={
                         'siteNo': str
                     }).set_index('siteNo')
dfRes2 = pd.read_csv(os.path.join(dirRoot2, 'result', code),
                     dtype={
                         'siteNo': str
                     }).set_index('siteNo')
dfGeo = gageII.readData(siteNoLst=dfRes1.index.tolist())
dfGeo = gageII.updateCode(dfGeo)

# select number of sites
countS = np.sort(dfRes1['count'].values)[::-1]
fig, ax = plt.subplots(1, 1)
ax.plot(np.arange(len(countS)), countS, '-*')
fig.show()

# plot map
nS = 200
dfR1 = dfRes1[dfRes1['count'] > nS]
siteNoLst = dfR1.index.tolist()
dfR2 = dfRes2.loc[siteNoLst]
dfG = dfGeo.loc[siteNoLst]

# crd
Code Example #9
from hydroDL.data import gageII
import numpy as np
import pandas as pd
import os

varLst = ['ECO2_BAS_DOM', 'ECO3_BAS_DOM']
dfR = gageII.readData(varLst=varLst)
dfR = gageII.updateCode(dfR)

fileEco3 = r'C:\Users\geofk\work\map\ecoRegion\tabEco3.csv'
tabEco3 = pd.read_csv(fileEco3)

fileLookup = os.path.join(gageII.dirTab, 'conterm_x_ecoregion3_names.csv')
tabLookup = pd.read_csv(fileLookup)

print(len(dfR['ECO3_BAS_DOM'].unique()))  # how many distinct level-3 eco region codes appear
codeLst = list(range(1, 85))
dfT = pd.DataFrame(index=codeLst, columns=['Eco2', 'Eco3', 'Eco3_Name'])
for code in codeLst:
    eco2 = dfR[dfR['ECO3_BAS_DOM'] == code]['ECO2_BAS_DOM'].unique()
    eco3Name = tabLookup[tabLookup['ECO3_CODE'] == code]['ECO3_NAME'].values
    eco3 = []  # guard: eco3 was unbound when eco3Name did not match exactly one row
    if len(eco3Name) == 1:
        eco3 = tabEco3[tabEco3['NA_L3NAME'] == eco3Name[0]]['NA_L3CODE'].values
        dfT.at[code, 'Eco3_Name'] = eco3Name[0]
    if len(eco2) == 1:
        dfT.at[code, 'Eco2'] = eco2[0]
    if len(eco3) == 1:
        dfT.at[code, 'Eco3'] = eco3[0]

fileT = os.path.join(gageII.dirTab, 'EcoTab.csv')
dfT.to_csv(fileT)
Code Example #10
    fig.suptitle(title)
    fig.show()
    fig.savefig(os.path.join(figFolder, figName))

siteNoLst = wqData.info['siteNo'].unique().tolist()
dfHBN = pd.read_csv(os.path.join(kPath.dirData, 'USGS', 'inventory',
                                 'HBN.csv'),
                    dtype={
                        'siteNo': str
                    }).set_index('siteNo')
siteNoHBN = [siteNo for siteNo in dfHBN.index.tolist() if siteNo in siteNoLst]
dropColLst = [
    'STANAME', 'WR_REPORT_REMARKS', 'ADR_CITATION', 'SCREENING_COMMENTS'
]
dfX = gageII.readData(siteNoLst=siteNoLst).drop(columns=dropColLst)
dfX = gageII.updateCode(dfX)
unitConv = 0.3048**3 * 365 * 24 * 60 * 60 / 1000**2  # ft^3/s -> m/yr of runoff depth, once divided by area in km^2

# area vs error
indHBN = [siteNoLst.index(siteNo) for siteNo in siteNoHBN]
area = dfX['DRAIN_SQKM'].values
errMat = errMatLst2[0]
code = '00605'
# code = '00955'
err = errMat[:, wqData.varC.index(code), 1]
fig, ax = plt.subplots(1, 1)
ax.plot(area, err, 'b*')
ax.plot(area[indHBN], err[indHBN], 'r*')
# np.nanmedian(err)
# np.nanmedian(err[indHBN, :])
fig.show()
Code Example #11
File: dataPrep.py    Project: sadeghst/geolearn
from hydroDL import kPath
from hydroDL.app import waterQuality
from hydroDL.data import gageII
import pandas as pd
import numpy as np
import os
import time

# all gages
fileSiteNo = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteNoLst-1979')
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()

tabSel = gageII.readData(
    varLst=['CLASS'], siteNoLst=siteNoLstAll)
tabSel = gageII.updateCode(tabSel)
siteNoLst = tabSel[tabSel['CLASS'] == 1].index.tolist()

# wqData = waterQuality.DataModelWQ.new('basinRef', siteNoLst)
wqData = waterQuality.DataModelWQ('basinRef')

# indYr1 = waterQuality.indYr(wqData.info, yrLst=[1979, 2000])[0]
# wqData.saveSubset('Y8090', indYr1)
# indYr2 = waterQuality.indYr(wqData.info, yrLst=[2000, 2020])[0]
# wqData.saveSubset('Y0010', indYr2)

indYrO, indYrE = waterQuality.indYrOddEven(wqData.info)
wqData.saveSubset('Yodd', indYrO)
wqData.saveSubset('Yeven', indYrE)
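waterQuality.indYrOddEven apparently splits sample indices by odd and even calendar year; a minimal sketch of that idea, assuming info carries a 'date' column per sample (the actual helper may differ):

import pandas as pd

def indYrOddEven(info):
    # split row positions by odd/even calendar year of the sample date (assumed logic)
    yr = pd.DatetimeIndex(info['date']).year
    return info.index[yr % 2 == 1].values, info.index[yr % 2 == 0].values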
Code Example #12
def wrapData(caseName, siteNoLst, rho=365, freq='D', optC='end'):
    """ wrap up input and target data for the model,as:
    x=[nT,nP,nX]
    y=[nP,nY]
    c=[nP,nC]
    where nP is number of time series
    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS site
    Keyword Arguments:
        rho {int} -- [description] (default: {365})
        nFill {int} -- max number of continous nan to interpolate in input data (default: {5})
        varC {list} -- list of water quality code to learn (default: {usgs.lstCodeSample})
        varG {list} -- list of constant variables in gageII (default: {gageII.lstWaterQuality})
        varQ and varF are fixed so far
    """
    sd = np.datetime64('1979-01-01')
    ed = np.datetime64('2019-12-31')
    # ts data
    varF = gridMET.varLst + ntn.varLst
    varC = usgs.varC
    varQ = usgs.varQ
    varG = gageII.lstWaterQuality
    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)
    # read data and merge to: x=[nT,nP,nX], xc=[nP,nY]
    fLst, qLst, cLst, gLst = [list() for x in range(4)]
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        varLst = varQ + varC + varF
        df = readSiteTS(siteNo, varLst=varLst, freq=freq)
        dfC = df[varC].dropna(how='all')
        for k in range(len(dfC)):
            ct = dfC.index[k]
            if freq == 'D':
                ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            elif freq == 'W':
                ctR = pd.date_range(ct - pd.Timedelta(days=rho * 7 - 1),
                                    ct,
                                    freq='W-TUE')
            else:
                raise ValueError("freq must be 'D' or 'W'")
            if (ctR[0] < sd) or (ctR[-1] > ed):
                continue
            for lst, var in zip([fLst, qLst], [varF, varQ]):
                temp = pd.DataFrame({
                    'date': ctR
                }).set_index('date').join(df[var])
                # temp = temp.interpolate(
                #     limit=nFill, limit_direction='both', limit_area='inside')
                # give up interpolation after many thoughts
                lst.append(temp.values)
            if optC == 'end':
                cLst.append(dfC.iloc[k].values)
            elif optC == 'seq':
                tempC = pd.DataFrame({
                    'date': ctR
                }).set_index('date').join(df[varC])
                cLst.append(tempC.values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    if optC == 'end':
        c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    elif optC == 'seq':
        c = np.stack(cLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    # save
    infoDf = pd.DataFrame(infoLst)
    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName,
                    rho=rho,
                    varG=varG,
                    varC=varC,
                    varQ=varQ,
                    varF=varF,
                    siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
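The optC flag changes the shape of the concentration target: 'end' keeps only the sample at the window's last day, giving c=[nP, nC], while 'seq' keeps the whole window, giving c=[nT, nP, nC] after the same stack/swapaxes step. A toy check of the two layouts:

import numpy as np

nT, nC, nP = 6, 2, 3  # window length, codes, samples (toy sizes)
cSeq = [np.random.rand(nT, nC) for _ in range(nP)]  # optC='seq': full window
cEnd = [w[-1] for w in cSeq]  # optC='end': last day only
cS = np.stack(cSeq, axis=-1).swapaxes(1, 2)  # [nT, nP, nC]
cE = np.stack(cEnd, axis=-1).swapaxes(0, 1)  # [nP, nC]
assert cS.shape == (nT, nP, nC) and cE.shape == (nP, nC)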
Code Example #13
    siteNoLst = wqData.info.iloc[wqData.subset[trainSet]].siteNo.unique()
    dfCrd = gageII.readData(
        varLst=['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoLst)
    lat = dfCrd['LAT_GAGE'].values
    lon = dfCrd['LNG_GAGE'].values
    shortName = usgs.codePdf.loc[code]['shortName']
    axplot.mapPoint(axM[k], lat, lon, corrLst[k][:, 1], vRange=[0.5, 1], s=16)
    axM[k].set_title('Testing correlation of {}'.format(shortName))
    # axplot.mapPoint(axM[k], lat, lon, rmseLst[k][:, 1], s=16)
    # axM[k].set_title('Testing RMSE of {}'.format(shortName))
plt.tight_layout()
figM.show()

# get rid of 00010 and 00095
siteLst = list()
for k, code in enumerate(codeLst):
    trainSet = '{}-Y1'.format(code)
    siteNoLst = wqData.info.iloc[wqData.subset[trainSet]].siteNo.unique()
    siteLst.append(siteNoLst)
siteNoAll = np.unique(np.concatenate(siteLst))
varG = ['GEOL_REEDBUSH_DOM', 'GEOL_HUNT_DOM_CODE']
# use the union of all code-specific site lists computed above
dfGeog = gageII.readData(
    varLst=varG+['LAT_GAGE', 'LNG_GAGE'], siteNoLst=siteNoAll.tolist())
dfGeog = gageII.updateCode(dfGeog)
lat = dfGeog['LAT_GAGE'].values
lon = dfGeog['LNG_GAGE'].values
figM, axM = plt.subplots(len(varG), 1, figsize=(6, 8))
for k, var in enumerate(varG):
    axplot.mapPoint(axM[k], lat, lon, dfGeog[var], s=16)
    axM[k].set_title(var)
figM.show()
Code Example #14
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import scipy
from mpl_toolkits import basemap
from hydroDL import kPath
from hydroDL.data import gageII, usgs

dirSel = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteSel')
with open(os.path.join(dirSel, 'dictRB_Y30N5.json')) as f:
    dictSite = json.load(f)
siteNoLst = dictSite['comb']
dfCrd1 = gageII.readData(
    varLst=['LAT_GAGE', 'LNG_GAGE', 'CLASS'], siteNoLst=siteNoLst)
dfCrd1 = gageII.updateCode(dfCrd1)


dirSel = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteSel')
with open(os.path.join(dirSel, 'dictRB_Y30N5.json')) as f:
    dictSiteN5 = json.load(f)
with open(os.path.join(dirSel, 'dictRB_Y30N2.json')) as f:
    dictSiteN2 = json.load(f)
codeLst = sorted(usgs.newC)
dictSite = dict()
for code in usgs.newC+['comb']:
    siteNoCode = list(set(dictSiteN2[code])-set(dictSiteN5['comb']))
    dictSite[code] = siteNoCode
siteNoLst = dictSite['comb']
nSite = len(siteNoLst)
dfCrd2 = gageII.readData(
Code Example #15
File: box_LW_ref.py    Project: sadeghst/geolearn
corrMat = np.full([len(siteNoLst), len(codeLst), 3], np.nan)
for ic, code in enumerate(codeLst):
    for siteNo in dictSite[code]:
        indS = siteNoLst.index(siteNo)
        v1 = dictLSTM[siteNo][code].iloc[ind2].values
        v2 = dictWRTDS[siteNo][code].iloc[ind2].values
        v3 = dictObs[siteNo][code].iloc[ind2].values
        rmse1, corr1 = utils.stat.calErr(v1, v2)
        rmse2, corr2 = utils.stat.calErr(v1, v3)
        rmse3, corr3 = utils.stat.calErr(v2, v3)
        corrMat[indS, ic, 0] = corr1
        corrMat[indS, ic, 1] = corr2
        corrMat[indS, ic, 2] = corr3

dfRef = gageII.readData(varLst=['CLASS'], siteNoLst=siteNoLst)
dfRef = gageII.updateCode(dfRef)
indRef = np.where(dfRef['CLASS'].values == 1)[0]
indNonRef = np.where(dfRef['CLASS'].values == 0)[0]

# plot box
labLst1 = [
    usgs.codePdf.loc[code]['shortName'] + '\n' + code for code in codeLst
]
labLst2 = ['LSTM vs WRTDS', 'LSTM vs Obs', 'WRTDS vs Obs']
dataBox = list()
for k in range(len(codeLst)):
    code = codeLst[k]
    temp = list()
    for i in [0, 1, 2]:
        temp.append(corrMat[indRef, k, i])
    dataBox.append(temp)
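utils.stat.calErr is assumed here to return an (RMSE, Pearson correlation) pair computed over entries where both series are observed; a minimal equivalent under that assumption:

import numpy as np

def calErr(a, b):
    # RMSE and Pearson r over jointly non-NaN entries (assumed behavior)
    mask = ~np.isnan(a) & ~np.isnan(b)
    if mask.sum() < 2:
        return np.nan, np.nan
    rmse = np.sqrt(np.mean((a[mask] - b[mask]) ** 2))
    corr = np.corrcoef(a[mask], b[mask])[0, 1]
    return rmse, corr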
Code Example #16
        if len(wqData.c.shape) == 3:
            p = yP[-1, :, master['varY'].index(code)]
            o = wqData.c[-1, ind, ic]
        elif len(wqData.c.shape) == 2:
            p = ycP[:, master['varYC'].index(code)]
            o = wqData.c[ind, ic]
        for siteNo in dictSite[code]:
            iS = siteNoLst.index(siteNo)
            indS = info[info['siteNo'] == siteNo].index.values
            rmse, corr = utils.stat.calErr(p[indS], o[indS])
            corrMat[iS, iCode, iT] = corr
            rmseMat[iS, iCode, iT] = rmse

# reference basins
tabRef = gageII.readData(varLst=['CLASS'], siteNoLst=siteNoLst)
tabRef = gageII.updateCode(tabRef)
bRef = (tabRef['CLASS'] == 1).values
ind1 = np.where(bRef)[0]
ind2 = np.where(~bRef)[0]

# plot box
labLst1 = [
    usgs.codePdf.loc[code]['shortName'] + '\n' + code for code in codeLst
]
labLst2 = ['train-ref', 'train-nonref', 'test-ref', 'test-nonref']

dataBox = list()
for k in range(len(codeLst)):
    code = codeLst[k]
    temp = list()
    for i in range(corrMat.shape[2]):