def readSiteTS(siteNo, varLst, freq='D', area=None,
               sd=np.datetime64('1979-01-01'),
               ed=np.datetime64('2019-12-31'),
               rmFlag=True):
    """Read the requested variables of one site into a single date-indexed table.

    A daily index from sd to ed is built first. Each data source (usgs
    samples, usgs streamflow, gridMET, ntn, GLASS, and the calT time
    variables) is read only when varLst contains at least one of its
    variables, and is then joined onto that index.

    Arguments:
        siteNo -- site identifier handed to every reader
        varLst {list} -- variables to return; also fixes the column order

    Keyword Arguments:
        freq {str} -- 'D' returns the daily table, 'W' returns the mean of
            weekly bins resampled with 'W-TUE' (default: {'D'})
        area -- area handed to calRunoffArea; when None it is looked up as
            DRAIN_SQKM through gageII.readData. Only used when 'runoff' is in
            varLst and a streamflow variable was matched (default: {None})
        sd {np.datetime64} -- first day of the index, also passed as
            startDate to the usgs readers (default: {1979-01-01})
        ed {np.datetime64} -- last day of the index (default: {2019-12-31})
        rmFlag {bool} -- if True, samples are read together with their flags
            (flag=2) and passed through usgs.removeFlag (default: {True})

    Returns:
        pd.DataFrame -- indexed by date, columns ordered as varLst

    Raises:
        ValueError -- if freq is neither 'D' nor 'W'
    """
    if freq not in ('D', 'W'):
        # Fail before any file is read; an unsupported freq used to fall
        # through every branch and silently return None.
        raise ValueError("freq must be 'D' or 'W', got {!r}".format(freq))

    def _requested(available):
        # Variables of varLst offered by one source. Order does not matter
        # here: the final column selection restores the order of varLst.
        return list(set(varLst).intersection(available))

    varC = _requested(usgs.varC)
    varQ = _requested(usgs.varQ)
    varF = _requested(gridMET.varLst)
    varP = _requested(ntn.varLst)
    varR = _requested(GLASS.varLst)
    varT = _requested(varTLst)

    dfD = pd.DataFrame({'date': pd.date_range(sd, ed)}).set_index('date')

    if varC:
        if rmFlag:
            dfC, dfCF = usgs.readSample(
                siteNo, codeLst=varC, startDate=sd, flag=2)
            dfC = usgs.removeFlag(dfC, dfCF)
        else:
            dfC = usgs.readSample(siteNo, codeLst=varC, startDate=sd)
        dfD = dfD.join(dfC)

    if varQ:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varLst:
            if area is None:
                tabArea = gageII.readData(
                    varLst=['DRAIN_SQKM'], siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        dfD = dfD.join(dfQ)

    if varF:
        dfD = dfD.join(gridMET.readBasin(siteNo, varLst=varF))

    if varP:
        dfD = dfD.join(ntn.readBasin(siteNo, varLst=varP, freq='D'))

    if varR:
        dfD = dfD.join(GLASS.readBasin(siteNo, varLst=varR, freq='D'))

    if varT:
        # calT is evaluated on the daily index itself, so no reader is needed.
        t = dfD.index.values
        matT, _ = calT(t)
        dfT = pd.DataFrame(index=t, columns=varTLst, data=matT)
        dfD = dfD.join(dfT[varT])

    # Keep only what was asked for, in the order it was asked for.
    dfD = dfD[varLst]
    if freq == 'W':
        return dfD.resample('W-TUE').mean()
    return dfD
def readSiteX(siteNo, varX, area=None, nFill=5,
              sd=np.datetime64('1979-01-01'),
              ed=np.datetime64('2020-01-01')):
    """Assemble the input variables varX of one site on a daily index.

    gridMET basin data is always read. Streamflow is read only when
    '00060' or 'runoff' is listed in varX. The selected columns are
    interpolated in both directions with at most nFill consecutive values
    filled.

    Arguments:
        siteNo -- site identifier handed to the readers
        varX {list} -- columns to return, in this order

    Keyword Arguments:
        area -- area handed to calRunoffArea; when None it is looked up as
            DRAIN_SQKM through gageII.readData (default: {None})
        nFill {int} -- limit passed to DataFrame.interpolate (default: {5})
        sd {np.datetime64} -- first day of the index, also the startDate of
            the streamflow reader (default: {1979-01-01})
        ed {np.datetime64} -- last day of the index (default: {2020-01-01})

    Returns:
        pd.DataFrame -- indexed by date, columns varX
    """
    dates = pd.date_range(sd, ed)
    table = pd.DataFrame({'date': dates}).set_index('date')
    forcing = gridMET.readBasin(siteNo)

    if '00060' in varX or 'runoff' in varX:
        flow = usgs.readStreamflow(siteNo, startDate=sd).rename(
            columns={'00060_00003': '00060'})
        if 'runoff' in varX:
            if area is None:
                area = gageII.readData(
                    varLst=['DRAIN_SQKM'],
                    siteNoLst=[siteNo])['DRAIN_SQKM'].values[0]
            flow['runoff'] = calRunoffArea(flow['00060'], area)
        table = table.join(flow)

    selected = table.join(forcing)[varX]
    return selected.interpolate(limit=nFill, limit_direction='both')
# NOTE(review): detached fragment -- no enclosing `def` is in view. It repeats the
# per-site / per-sample windowing loop found in wrapData, except that the forcing
# window additionally joins `dfP`, which is never assigned inside this fragment.
# TODO confirm where dfP (and siteNoLst, varC, startDate, endDate, rho, nFill)
# are expected to come from before reusing this code.
# read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY] fLst = list() # forcing ts gLst = list() # geo-const qLst = list() # streamflow cLst = list() # water quality cfLst = list() # water quality flags infoLst = list() t0 = time.time() for i, siteNo in enumerate(siteNoLst): t1 = time.time() dfC, dfCF = usgs.readSample(siteNo, codeLst=varC, startDate=startDate, flag=2) dfQ = usgs.readStreamflow(siteNo, startDate=startDate) dfF = gridMET.readBasin(siteNo) for k in range(len(dfC)): ct = dfC.index[k] ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct) if (ctR[0] < startDate) or (ctR[-1] > endDate): continue tempQ = pd.DataFrame({ 'date': ctR }).set_index('date').join(dfQ).interpolate(limit=nFill, limit_direction='both') tempF = pd.DataFrame({ 'date': ctR }).set_index('date').join(dfF).join(dfP).interpolate( limit=nFill, limit_direction='both') qLst.append(tempQ.values) fLst.append(tempF.values)
def wrapData(caseName, siteNoLst, rho=365, nFill=5, varC=usgs.varC,
             varG=gageII.lstWaterQuality):
    """Wrap up input and target data for the model and save it to disk.

    For every water quality sample of every site, the rho days ending on the
    sample date are cut out of the streamflow and gridMET series. Samples
    whose window reaches outside 1979-01-01 .. 2019-12-31 are skipped.

    Arrays written to <dirWQ>/trainData/<caseName> by np.savez, where nP is
    the number of kept samples:
        q  = [rho, nP, 2]   first streamflow column and calRunoff output
        f  = [rho, nP, nF]  gridMET forcing
        c  = [nP, nC]       water quality samples
        cf = [nP, nC]       water quality flags
        g  = [nP, nG]       gageII constants of the sample's site
    Alongside, <caseName>.csv lists siteNo and date of each sample and
    <caseName>.json records the settings of this case.

    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS site

    Keyword Arguments:
        rho {int} -- length in days of the window ending on each sample date
            (default: {365})
        nFill {int} -- max number of continuous nan to interpolate in input
            data (default: {5})
        varC {list} -- list of water quality code to learn
            (default: {usgs.varC})
        varG {list} -- list of constant variables in gageII
            (default: {gageII.lstWaterQuality})
        varQ and varF are fixed so far

    Raises:
        ValueError -- from np.stack when no sample window was kept
    """
    # Bounds of the usable record; they also limit what the usgs readers load.
    # pd.Timestamp replaces pd.datetime, which was removed in pandas 2.0.
    startDate = pd.Timestamp(1979, 1, 1)
    endDate = pd.Timestamp(2019, 12, 31)

    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)

    # read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
    fLst = list()  # forcing ts
    gLst = list()  # geo-const
    qLst = list()  # streamflow
    cLst = list()  # water quality
    cfLst = list()  # water quality flags
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        dfC, dfCF = usgs.readSample(
            siteNo, codeLst=varC, startDate=startDate, flag=2)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        dfF = gridMET.readBasin(siteNo)
        for k, ct in enumerate(dfC.index):
            # rho consecutive days, the last one being the sample date
            ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            if (ctR[0] < startDate) or (ctR[-1] > endDate):
                continue
            # join() returns a new frame, so one empty window serves both
            window = pd.DataFrame({'date': ctR}).set_index('date')
            tempQ = window.join(dfQ).interpolate(
                limit=nFill, limit_direction='both')
            tempF = window.join(dfF).interpolate(
                limit=nFill, limit_direction='both')
            qLst.append(tempQ.values)
            fLst.append(tempF.values)
            cLst.append(dfC.iloc[k].values)
            cfLst.append(dfCF.iloc[k].values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        print('{} on site {} reading {:.3f} total \n{:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))

    # Samples are stacked on a new last axis and then moved in front of the
    # variable axis.
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    cf = np.stack(cfLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    infoDf = pd.DataFrame(infoLst)

    # add runoff: only the first streamflow column is kept next to it
    runoff = calRunoff(q[:, :, 0], infoDf)
    q = np.stack([q[:, :, 0], runoff], axis=-1).astype(np.float32)

    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g, cf=cf)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName, rho=rho, nFill=nFill, varG=varG,
                    varC=varC, varQ=['00060', 'runoff'],
                    varF=gridMET.varLst, siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
# NOTE(review): detached fragment -- no enclosing `def` and no return are in view;
# it reads siteNo, varLst, sd, ed, freq and area without assigning them.
# dfD is built twice: the copy that dfC is first joined onto is replaced after
# "# extract data". dfQ, dfF, dfC and dfP then appear to be joined
# unconditionally although each is only assigned when its variable group is
# non-empty -- presumably a NameError when a group is missing; TODO confirm
# against the original indentation.
# read data td = pd.date_range(sd, ed) varC = list(set(varLst).intersection(usgs.varC)) varQ = list(set(varLst).intersection(usgs.varQ)) varF = list(set(varLst).intersection(gridMET.varLst)) varP = list(set(varLst).intersection(ntn.varLst)) dfD = pd.DataFrame({'date': td}).set_index('date') if len(varC) > 0: dfC = usgs.readSample(siteNo, codeLst=varC, startDate=sd) dfD = dfD.join(dfC) if len(varQ) > 0: dfQ = usgs.readStreamflow(siteNo, startDate=sd) dfQ = dfQ.rename(columns={'00060_00003': '00060'}) if len(varF) > 0: dfF = gridMET.readBasin(siteNo, varLst=varF) if len(varP) > 0: dfP = ntn.readBasin(siteNo, varLst=varP, freq=freq) # extract data dfD = pd.DataFrame({'date': td}).set_index('date') if 'runoff' in varLst: if area is None: tabArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=[siteNo]) area = tabArea['DRAIN_SQKM'].values[0] dfQ['runoff'] = waterQuality.calRunoffArea(dfQ['00060'], area) dfD = dfD.join(dfQ) dfD = dfD.join(dfF) dfD = dfD.join(dfC) dfD = dfD.join(dfP) dfD = dfD[varLst]