def funcPoint(iP, axP):
    """Plot predicted and observed loads for one site, one panel per code.

    Both prediction frames and the observed concentrations are multiplied
    by their '00060' column (labelled "streamflow" in the disabled panel
    below) before plotting. Observations are split into four series by the
    value of the ``<code>_cd`` column and by odd/even calendar year.

    Args:
        iP: index into the module-level ``siteNoLstP``.
        axP: indexable collection of axes; ``axP[k]`` receives the panel
            for ``codeSel[k]``.
    """
    siteNo = siteNoLstP[iP]
    dfPred1, _ = basins.loadSeq(outLst[0], siteNo)
    dfPred2, _ = basins.loadSeq(outLst[1], siteNo)
    sd = np.datetime64('1980-01-01')
    dfQ = waterQuality.readSiteY(siteNo, ['00060'], sd=sd)
    dfC = waterQuality.readSiteY(
        siteNo, codeSel + [code + '_cd' for code in codeSel], sd=sd)
    dfPred1 = dfPred1[dfPred1.index >= sd]
    dfPred2 = dfPred2[dfPred2.index >= sd]
    # scale every column by the '00060' column of the same frame
    dfPred1 = dfPred1.multiply(dfPred1['00060'], axis='index')
    dfPred2 = dfPred2.multiply(dfPred2['00060'], axis='index')
    dfC[codeSel] = dfC[codeSel].multiply(dfQ['00060'], axis='index')
    t = dfPred1.index.values.astype(np.datetime64)
    # axplot.plotTS(axP[0], t, [dfPred1['00060'], dfQ['00060']], tBar=tBar,
    #               legLst=['pred-opt1', 'obs'], styLst='--', cLst='br')
    # axP[0].set_title('{} streamflow'.format(siteNo))

    # loop-invariant plot settings; label order follows the order of `data`
    styLst = ['-', '-', '*', '*', '*', '*']
    legLst = [
        'model odd', 'model even', 'obs odd', 'obs even', 'flag odd',
        'flag even'
    ]
    yr = dfC.index.year
    isEven = (yr % 2 == 0)
    isOdd = (yr % 2 == 1)
    for k, var in enumerate(codeSel):
        shortName = codePdf.loc[var]['shortName']
        title = '{} {} {}'.format(siteNo, shortName, var)
        vf = dfC[var + '_cd'].values
        notX = (vf != 'x') & (vf != 'X')
        isX = (vf == 'x') | (vf == 'X')
        c1 = dfC[var].values.copy()
        c2 = dfC[var].values.copy()
        f1 = dfC[var].values.copy()
        f2 = dfC[var].values.copy()
        # c1 / c2: samples whose code is 'x'/'X', odd / even years
        c1[notX] = np.nan
        c1[isEven] = np.nan
        c2[notX] = np.nan
        c2[isOdd] = np.nan
        # f1 / f2: all remaining samples, odd / even years
        f1[isX | isEven] = np.nan
        f2[isX | isOdd] = np.nan
        data = [dfPred1[var].values, dfPred2[var].values, c1, c2, f1, f2]
        axplot.plotTS(axP[k], t, data, styLst=styLst, cLst='bgrmkk',
                      legLst=legLst)
        axP[k].set_title(title)
def funcPoint(iP, axP):
    """Plot the samples of ``code`` at one site, split at 2000-01-01.

    The title shows the site number followed by the non-null sample counts
    before and from the split date.

    Args:
        iP: index into the module-level ``siteNoLst``.
        axP: a single axes object (passed to ``axplot.plotTS``).
    """
    tBar = np.datetime64('2000-01-01')
    siteNo = siteNoLst[iP]
    dfC = waterQuality.readSiteY(siteNo, [code])
    tAry = dfC.index.values.astype(np.datetime64)
    axplot.plotTS(axP, tAry, dfC[code], styLst='*', tBar=tBar)
    # non-null counts on either side of the split date
    dfBefore = dfC[dfC[code].index < tBar]
    dfAfter = dfC[dfC[code].index >= tBar]
    n1 = dfBefore.count().values
    n2 = dfAfter.count().values
    axP.set_title('{} #samples = {} {}'.format(siteNo, n1, n2))
def funcPoint(iP, axP):
    """Plot LSTM and WRTDS predictions against observations for one site.

    The WRTDS prediction is read from a per-site CSV under
    ``modelStat/WRTDS/Yodd`` and given a daily index from 1979-01-01 to
    2020-01-01 (inclusive). Observations are split into four series by the
    value of the ``<code>_cd`` column and by odd/even calendar year.

    Args:
        iP: index into the module-level ``siteNoLstP``.
        axP: indexable collection of axes; ``axP[k]`` receives the panel
            for ``codeSel[k]``.
    """
    siteNo = siteNoLstP[iP]
    dfPred1, _ = basins.loadSeq(outName, siteNo, ep=ep)
    dfPred2 = pd.read_csv(
        os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS', 'Yodd', siteNo),
        index_col=None)
    # pd.datetime was removed from pandas; date strings give the same
    # inclusive daily range.
    ctR = pd.date_range('1979-01-01', '2020-01-01')
    dfPred2.index = ctR
    dfPred2.index.name = 'date'
    sd = np.datetime64('1980-01-01')
    # NOTE(review): dfQ is not used below; kept so the read still happens.
    dfQ = waterQuality.readSiteY(siteNo, ['00060'], sd=sd)
    dfC = waterQuality.readSiteY(
        siteNo, codeSel + [code + '_cd' for code in codeSel], sd=sd)
    dfPred1 = dfPred1[dfPred1.index >= sd]
    dfPred2 = dfPred2[dfPred2.index >= sd]
    t = dfPred1.index.values.astype(np.datetime64)
    # axplot.plotTS(axP[0], t, [dfPred1['00060'], dfQ['00060']], tBar=tBar,
    #               legLst=['pred-opt1', 'obs'], styLst='--', cLst='br')
    # axP[0].set_title('{} streamflow'.format(siteNo))

    # loop-invariant plot settings; label order follows the order of `data`
    styLst = ['-', '-', '*', '*', '*', '*']
    legLst = ['LSTM', 'WRTDS', 'obs odd', 'obs even',
              'flag odd', 'flag even']
    yr = dfC.index.year
    isEven = (yr % 2 == 0)
    isOdd = (yr % 2 == 1)
    for k, var in enumerate(codeSel):
        shortName = codePdf.loc[var]['shortName']
        title = '{} {} {}'.format(siteNo, shortName, var)
        vf = dfC[var + '_cd'].values
        notX = (vf != 'x') & (vf != 'X')
        isX = (vf == 'x') | (vf == 'X')
        c1 = dfC[var].values.copy()
        c2 = dfC[var].values.copy()
        f1 = dfC[var].values.copy()
        f2 = dfC[var].values.copy()
        # c1 / c2: samples whose code is 'x'/'X', odd / even years
        c1[notX] = np.nan
        c1[isEven] = np.nan
        c2[notX] = np.nan
        c2[isOdd] = np.nan
        # f1 / f2: all remaining samples, odd / even years
        f1[isX | isEven] = np.nan
        f2[isX | isOdd] = np.nan
        data = [dfPred1[var].values, dfPred2[var].values, c1, c2, f1, f2]
        axplot.plotTS(axP[k], t, data, styLst=styLst, cLst='bgrmkk',
                      legLst=legLst)
        axP[k].set_title(title)
def funcPoint(iP, axP):
    """Plot streamflow and per-code predictions/observations for one site.

    ``axP[0]`` shows the '00060' column of the first prediction against the
    observed '00060'; ``axP[k + 1]`` shows both predictions, all
    observations, and the observations whose ``<code>_cd`` value is not
    'x'/'X', for ``codeSel[k]``.

    Args:
        iP: index into the module-level ``siteNoLstP``.
        axP: indexable collection of at least ``len(codeSel) + 1`` axes.
    """
    siteNo = siteNoLstP[iP]
    tBar = np.datetime64('2000-01-01')
    sd = np.datetime64('1980-01-01')

    predA, _ = basins.loadSeq(outLst[0], siteNo)
    predB, _ = basins.loadSeq(outLst[1], siteNo)
    dfQ = waterQuality.readSiteY(siteNo, ['00060'], sd=sd)
    flagCols = [code + '_cd' for code in codeSel]
    dfC = waterQuality.readSiteY(siteNo, codeSel + flagCols, sd=sd)
    predA = predA[predA.index >= sd]
    predB = predB[predB.index >= sd]
    t = predA.index.values.astype(np.datetime64)

    # first panel: streamflow
    axQ = axP[0]
    axplot.plotTS(axQ, t, [predA['00060'], dfQ['00060']], tBar=tBar,
                  legLst=['pred-opt1', 'obs'], styLst='--', cLst='br')
    axQ.set_title('{} streamflow'.format(siteNo))

    # remaining panels: one per selected code
    for iAx, var in enumerate(codeSel, start=1):
        shortName = codePdf.loc[var]['shortName']
        obsAll = dfC[var].values.copy()
        flag = dfC[var + '_cd'].values
        obsRest = dfC[var].values.copy()
        obsRest[(flag == 'x') | (flag == 'X')] = np.nan
        series = [predA[var].values, predB[var].values, obsAll, obsRest]
        axplot.plotTS(axP[iAx], t, series, tBar=tBar,
                      legLst=['pred', 'pred-rmFlag', 'obs', 'obs-flag'],
                      styLst=['-', '-', '*', '*'], cLst='bgrk')
        axP[iAx].set_title(' {} {}'.format(shortName, var))
def funcPoint(iP, axP):
    """Plot inputs, '00955' samples and their lagged correlation at a site.

    For each lag ``k`` in ``range(nt)`` (days) the inputs ``k`` days before
    every sample date are correlated with the sample values, column by
    column. ``axP[0]`` shows the '00060' input, ``axP[1]`` the samples and
    ``axP[2]`` the correlation of input column 1 against lag.

    Args:
        iP: index into the module-level ``siteNoLst``.
        axP: indexable collection of at least three axes.
    """
    siteNo = siteNoLst[iP]
    dfY = waterQuality.readSiteY(siteNo, ['00955'])
    dfY = dfY.dropna()
    dfX = waterQuality.readSiteX(siteNo, varX)
    t = dfY.index
    y = dfY['00955'].values
    corrMat = np.zeros([nt, nx])
    for k in range(nt):
        x = dfX.loc[t.values - np.timedelta64(k, 'D')].values
        for i in range(nx):
            # Mask per column. The previous np.where(~np.isnan(x))[0] on
            # the 2-D array returned one row index per non-NaN cell, so
            # rows were duplicated and rows that were NaN in column i
            # still reached corrcoef.
            ok = ~np.isnan(x[:, i])
            corrMat[k, i] = np.corrcoef(x[ok, i], y[ok])[0, 1]
    axP[0].plot(dfX['00060'], '-b', label='streamflow')
    axP[1].plot(dfY, '-*r', label='silica')
    # NOTE(review): only input column 1 is shown — confirm this is the
    # intended variable.
    axP[2].plot(np.arange(nt), corrMat[:, 1].T, '-*')
    axP[2].set_ylabel('correlation')
    axP[2].set_xlabel('lag day')
import importlib
import pandas as pd
import numpy as np
import os
import time
import scipy.signal as signal

# For every site of the 'Silica64' data model: drop extreme '00955' values,
# then express observation dates as float day offsets from 1979-01-01.
wqData = waterQuality.DataModelWQ('Silica64')
siteNoLst = wqData.siteNoLst
for siteNo in siteNoLst:
    print(siteNo)
    dfObs = waterQuality.readSiteY(siteNo, ['00955'])
    # rm outlier: upper limit is mean + 5 * std of the values strictly
    # between the 1st and 99th percentiles
    df = dfObs[dfObs['00955'].notna().values]
    y = df['00955'].values
    yV = y[y < np.percentile(y, 99)]
    yV = yV[yV > np.percentile(y, 1)]
    ul = np.mean(yV) + np.std(yV) * 5
    dfObs[dfObs['00955'] > ul] = np.nan
    # fourier: xx covers every date in dfObs, x only the remaining samples
    df = dfObs[dfObs.notna().values]
    tt = dfObs.index.values
    # builtin float replaces the np.float alias removed from NumPy
    xx = (tt.astype('datetime64[D]') -
          np.datetime64('1979-01-01')).astype(float)
    t = df.index.values
    x = (t.astype('datetime64[D]') -
         np.datetime64('1979-01-01')).astype(float)
def funcPoint(iP, axP):
    """Plot the samples of ``code`` at one site with the count in the title.

    Args:
        iP: index into the module-level ``siteNoLst``.
        axP: a single axes object (passed to ``axplot.plotTS``).
    """
    siteNo = siteNoLst[iP]
    dfC = waterQuality.readSiteY(siteNo, [code])
    tAry = dfC.index.values.astype(np.datetime64)
    axplot.plotTS(axP, tAry, dfC[code], styLst='*')
    nSample = dfC.count().values
    title = '{} #samples = {}'.format(siteNo, nSample)
    axP.set_title(title)
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
from hydroDL.model import rnn, crit
import os

# Read inputs/targets for one site, normalize them, and split at 2000-01-01.
siteNo = '01434025'
# siteNo = '01364959'
codeLst = ['00915', '00940', '00955']
varX = gridMET.varLst
varY = ['00060']

dfX = waterQuality.readSiteX(siteNo, varX)
dfY = waterQuality.readSiteY(siteNo, varY)

# normalize inputs and targets, keeping the original index and column labels
mtdX = waterQuality.extractVarMtd(varX)
normX, statX = transform.transInAll(dfX.values, mtdX)
dfXN = pd.DataFrame(normX, index=dfX.index, columns=dfX.columns)
mtdY = waterQuality.extractVarMtd(varY)
normY, statY = transform.transInAll(dfY.values, mtdY)
dfYN = pd.DataFrame(normY, index=dfY.index, columns=dfY.columns)

# *1: rows before the split date, *2: rows from the split date onward
tSplit = np.datetime64('2000-01-01')
matX1 = dfXN.loc[dfXN.index < tSplit].values
matY1 = dfYN.loc[dfYN.index < tSplit].values
matX2 = dfXN.loc[dfXN.index >= tSplit].values
matY2 = dfYN.loc[dfYN.index >= tSplit].values
matX = dfXN.values
matY = dfYN.values
def loadSeq(siteNo, varY, model, optX='F', optT='Y8090', order=(5, 0, 5)):
    """Load a cached per-site prediction, or train a model and cache it.

    Predictions are stored as a CSV named ``siteNo`` in a folder under
    ``kPath.dirWQ/modelStat/<model>`` whose name encodes ``optX``,
    ``optT``, ``varY`` and (ARMA only) ``order``. If that file exists it is
    read and returned; otherwise the model is trained on the period
    selected by ``optT``, applied to the full record, and the prediction
    plus the normalization statistics (``<siteNo>_stat.json``) are saved.

    Args:
        siteNo: site identifier, also used as the file name.
        varY: name of the target variable (a single column).
        model: 'ARMA' or 'LR'.
        optX: 'F' for ``gridMET.varLst`` inputs, 'QF' to prepend '00060'.
        optT: 'Y8090' to train on dates before 2000-01-01, 'Y0010' to
            train on dates from 2000-01-01 onward.
        order: order passed to ``trainARMA``; only used when ``model`` is
            'ARMA'.

    Returns:
        DataFrame with the prediction for ``varY``.

    Raises:
        Exception: if ``model``, ``optX`` or ``optT`` is not one of the
            supported values (``optX``/``optT`` are only checked when no
            cached prediction exists).
    """
    if model == 'ARMA':
        dirAR = os.path.join(kPath.dirWQ, 'modelStat', 'ARMA')
        strOrder = '-'.join([str(k) for k in order])
        saveFolderName = '{}-{}-{}-{}'.format(optX, optT, varY, strOrder)
        saveFolder = os.path.join(dirAR, saveFolderName)
    elif model == 'LR':
        dirLR = os.path.join(kPath.dirWQ, 'modelStat', 'LR')
        saveFolderName = '{}-{}-{}'.format(optX, optT, varY)
        saveFolder = os.path.join(dirLR, saveFolderName)
    else:
        raise Exception('model {} invalid!'.format(model))
    predFile = os.path.join(saveFolder, siteNo)
    # makedirs also creates missing parent folders and does not fail when
    # the folder already exists (e.g. created by a concurrent run)
    os.makedirs(saveFolder, exist_ok=True)

    if os.path.exists(predFile):
        # cached prediction available
        dfP = pd.read_csv(predFile, index_col=None)
        dfP = utils.time.datePdf(dfP)
        return dfP

    if optX == 'F':
        varX = gridMET.varLst
    elif optX == 'QF':
        varX = ['00060'] + gridMET.varLst
    else:
        raise Exception('optX {} invalid!'.format(optX))
    dfX = waterQuality.readSiteX(siteNo, varX)
    dfY = waterQuality.readSiteY(siteNo, [varY])

    # normalize
    mtdX = waterQuality.extractVarMtd(varX)
    normX, statX = transform.transInAll(dfX.values, mtdX)
    dfXN = pd.DataFrame(data=normX, index=dfX.index, columns=dfX.columns)
    mtdY = waterQuality.extractVarMtd([varY])
    normY, statY = transform.transInAll(dfY.values, mtdY)
    dfYN = pd.DataFrame(data=normY, index=dfY.index, columns=dfY.columns)

    # select the training period
    tSplit = np.datetime64('2000-01-01')
    if optT == 'Y8090':
        dfXT = dfXN[dfXN.index < tSplit]
        dfYT = dfYN[dfYN.index < tSplit]
    elif optT == 'Y0010':
        dfXT = dfXN[dfXN.index >= tSplit]
        dfYT = dfYN[dfYN.index >= tSplit]
    else:
        raise Exception('optT {} invalid!'.format(optT))

    # train and test
    if model == 'ARMA':
        dfPN, resT = trainARMA(dfXT, dfYT, dfXN, dfYN, order)
    elif model == 'LR':
        dfPN = trainLR(dfXT, dfYT, dfXN, dfYN)
    # map the normalized prediction back with the target's method and stats
    yP = transform.transOut(dfPN.values, mtdY[0], statY[0])
    dfP = pd.DataFrame(data=yP, index=dfYN.index, columns=dfYN.columns)

    # save result, model, stat
    dfP.reset_index().to_csv(predFile, index=False)
    statFile = os.path.join(saveFolder, siteNo + '_stat.json')
    with open(statFile, 'w') as fp:
        json.dump(dict(statX=statX, statY=statY), fp, indent=4)
    # save model
    # if model == 'ARMA':
    #     modelFile = os.path.join(saveFolder, siteNo+'_model.p')
    #     resT.save(modelFile)
    return dfP
code = '00955'

# silica num > 100 in both training and testing (named silica64)
hasEnough = (df1[code] > 100) & (df2[code] > 100)
siteNoLst = df0[hasEnough].index.tolist()
if not waterQuality.exist('Silica64'):
    wqData = waterQuality.DataModelWQ.new('Silica64', siteNoLst)
wqData = waterQuality.DataModelWQ('Silica64')

# year-based indices over all samples (saving is currently disabled)
indYr1 = waterQuality.indYr(wqData.info, yrLst=[1979, 2000])[0]
# wqData.saveSubset('Y8090', indYr1)
indYr2 = waterQuality.indYr(wqData.info, yrLst=[2000, 2020])[0]
# wqData.saveSubset('Y0010', indYr2)

# subset only have silica: rows where the column for `code` is not NaN
ic = wqData.varC.index(code)
indC = np.where(~np.isnan(wqData.c[:, ic]))[0]
wqData.saveSubset(code, indC)
infoC = wqData.info.iloc[indC]
indYr1 = waterQuality.indYr(infoC, yrLst=[1979, 2000])[0]
# wqData.saveSubset('{}-Y8090'.format(code), indYr1)
indYr2 = waterQuality.indYr(infoC, yrLst=[2000, 2020])[0]
# wqData.saveSubset('{}-Y0010'.format(code), indYr2)

# plot the samples of five sites: entries 5..9 of `ind`
figP, axP = plt.subplots(5, 1, figsize=(8, 6))
for k in range(5):
    kk = k + 5
    siteNo = siteNoLstAll[ind[kk]]
    dfC = waterQuality.readSiteY(siteNo, [code])
    t = dfC.index.values.astype(np.datetime64)
    axplot.plotTS(axP[k], t, dfC['00955'], styLst='*')
    nSample = dfC.count().values[0]
    axP[k].set_title('{} #samples = {}'.format(siteNo, nSample))
figP.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import statsmodels.api as sm
from scipy import stats
import scipy

# Build a design matrix of lagged 'pr' values plus five other input
# columns for one site, aligned with the '00060' target.
siteNo = '401733105392404'
# siteNo = '01364959'
codeLst = ['00915', '00955']
varX = gridMET.varLst
varY = ['00060']

dfX = waterQuality.readSiteX(siteNo, varX)
dfY = waterQuality.readSiteY(siteNo, varY)
dfC = waterQuality.readSiteY(siteNo, codeLst)

x = dfX['pr'].values
xA = dfX.values
y = dfY['00060'].values

nt = len(x)
rho = 365
# Columns [0, rho): 'pr' shifted by k steps. Columns [rho, rho + 5): input
# columns 2..6 from row rho onward. The last two columns keep their initial
# value of one.
matX = np.ones((nt - rho, rho + 7))
for k in range(rho):
    matX[:, k] = x[k:k + nt - rho]
for k in range(5):
    matX[:, rho + k] = xA[rho:, 2 + k]
matY = y[rho:]
# positions of the non-NaN targets
indV = np.nonzero(~np.isnan(matY))[0]
siteNoLstAll = pd.read_csv(fileSiteNo, header=None, dtype=str)[0].tolist()
codeLst = sorted(usgs.codeLst)

# steps to run; enable one by un-commenting its append
doLst = list()
# doLst.append('calCount')
# doLst.append('calCountCorr')

if 'calCount' in doLst:
    # calculate number of samples (all, B2000, A2000)
    df0 = pd.DataFrame(index=siteNoLstAll, columns=codeLst)
    df1 = pd.DataFrame(index=siteNoLstAll, columns=codeLst)
    df2 = pd.DataFrame(index=siteNoLstAll, columns=codeLst)
    tBar = np.datetime64('2000-01-01')
    for k, siteNo in enumerate(siteNoLstAll):
        print(k)
        dfC = waterQuality.readSiteY(siteNo, codeLst)
        df0.loc[siteNo] = dfC.count()
        df1.loc[siteNo] = dfC[dfC.index < tBar].count()
        df2.loc[siteNo] = dfC[dfC.index >= tBar].count()
    # The index has no name, so label it explicitly: the files are read
    # back below with index_col='siteNo', which needs that header.
    df0.to_csv(os.path.join(dirInv, 'codeCount.csv'), index_label='siteNo')
    df1.to_csv(os.path.join(dirInv, 'codeCount_B2000.csv'),
               index_label='siteNo')
    df2.to_csv(os.path.join(dirInv, 'codeCount_A2000.csv'),
               index_label='siteNo')

# NOTE(review): this guard repeats 'calCount'; the disabled append above
# suggests 'calCountCorr' may have been intended — confirm before changing.
if 'calCount' in doLst:
    # find out two variables (hopefully one rock one bio) that are most related
    df0 = pd.read_csv(os.path.join(dirInv, 'codeCount.csv'),
                      dtype={'siteNo': str}, index_col='siteNo')
    df1 = pd.read_csv(os.path.join(dirInv, 'codeCount_B2000.csv'),
                      dtype={'siteNo': str}, index_col='siteNo')
    df2 = pd.read_csv(os.path.join(dirInv, 'codeCount_A2000.csv'),
                      dtype={'siteNo': str}, index_col='siteNo')