Example #1
def readSiteTS(siteNo,
               varLst,
               freq='D',
               area=None,
               sd=np.datetime64('1979-01-01'),
               ed=np.datetime64('2019-12-31'),
               rmFlag=True):
    # read data
    td = pd.date_range(sd, ed)
    varC = list(set(varLst).intersection(usgs.varC))
    varQ = list(set(varLst).intersection(usgs.varQ))
    varF = list(set(varLst).intersection(gridMET.varLst))
    varP = list(set(varLst).intersection(ntn.varLst))
    varR = list(set(varLst).intersection(GLASS.varLst))
    varT = list(set(varLst).intersection(varTLst))

    dfD = pd.DataFrame({'date': td}).set_index('date')
    if len(varC) > 0:
        if rmFlag:
            dfC, dfCF = usgs.readSample(siteNo,
                                        codeLst=varC,
                                        startDate=sd,
                                        flag=2)
            dfC = usgs.removeFlag(dfC, dfCF)
        else:
            dfC = usgs.readSample(siteNo, codeLst=varC, startDate=sd)
        dfD = dfD.join(dfC)
    if len(varQ) > 0:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varLst:
            if area is None:
                tabArea = gageII.readData(varLst=['DRAIN_SQKM'],
                                          siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        dfD = dfD.join(dfQ)
    if len(varF) > 0:
        dfF = gridMET.readBasin(siteNo, varLst=varF)
        dfD = dfD.join(dfF)
    if len(varP) > 0:
        dfP = ntn.readBasin(siteNo, varLst=varP, freq='D')
        dfD = dfD.join(dfP)
    if len(varR) > 0:
        dfR = GLASS.readBasin(siteNo, varLst=varR, freq='D')
        dfD = dfD.join(dfR)
    if len(varT) > 0:
        t = dfD.index.values
        matT, _ = calT(t)
        dfT = pd.DataFrame(index=t, columns=varTLst, data=matT)
        dfD = dfD.join(dfT[varT])
    dfD = dfD[varLst]
    if freq == 'D':
        return dfD
    elif freq == 'W':
        dfW = dfD.resample('W-TUE').mean()
        return dfW
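
The freq == 'W' branch relies on pandas' anchored weekly resampling. A minimal standalone sketch of that pattern on synthetic data (no dependency on the usgs/gridMET/ntn modules):

import numpy as np
import pandas as pd

# stand-in for the joined daily table dfD (random values, one column)
td = pd.date_range('2000-01-01', '2000-03-31')
dfD = pd.DataFrame({'00060': np.random.rand(len(td))}, index=td)

# weekly means labeled on Tuesdays, as returned when freq == 'W'
dfW = dfD.resample('W-TUE').mean()
print(dfW.head())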
Example #2
def readSiteX(siteNo,
              varX,
              area=None,
              nFill=5,
              sd=np.datetime64('1979-01-01'),
              ed=np.datetime64('2020-01-01')):
    tr = pd.date_range(sd, ed)
    dfX = pd.DataFrame({'date': tr}).set_index('date')
    # extract data
    dfF = gridMET.readBasin(siteNo)
    if '00060' in varX or 'runoff' in varX:
        dfQ = usgs.readStreamflow(siteNo, startDate=sd)
        dfQ = dfQ.rename(columns={'00060_00003': '00060'})
        if 'runoff' in varX:
            if area is None:
                tabArea = gageII.readData(varLst=['DRAIN_SQKM'],
                                          siteNoLst=[siteNo])
                area = tabArea['DRAIN_SQKM'].values[0]
            dfQ['runoff'] = calRunoffArea(dfQ['00060'], area)
        dfX = dfX.join(dfQ)
    dfX = dfX.join(dfF)
    dfX = dfX[varX]
    dfX = dfX.interpolate(limit=nFill, limit_direction='both')
    return dfX
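
The interpolate(limit=nFill, limit_direction='both') call is what bounds gap filling in dfX. A small self-contained sketch of that behavior on a synthetic series:

import numpy as np
import pandas as pd

tr = pd.date_range('2000-01-01', periods=12)
s = pd.Series([1.0, np.nan, np.nan, 4.0, 5.0,
               np.nan, np.nan, np.nan, np.nan, np.nan,
               11.0, 12.0], index=tr)

# at most nFill consecutive NaNs are filled from each side of a gap;
# the interior of longer gaps stays NaN
nFill = 2
print(s.interpolate(limit=nFill, limit_direction='both'))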
Example #3
# read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
# (fragment: siteNoLst, varC, rho, nFill, startDate, endDate and the
#  precipitation-chemistry table dfP, presumably from ntn.readBasin as in
#  Example #1, are assumed to be defined earlier in the source file)
fLst = list()  # forcing ts
gLst = list()  # geo-const
qLst = list()  # streamflow
cLst = list()  # water quality
cfLst = list()  # water quality flags
infoLst = list()
t0 = time.time()
for i, siteNo in enumerate(siteNoLst):
    t1 = time.time()
    dfC, dfCF = usgs.readSample(siteNo,
                                codeLst=varC,
                                startDate=startDate,
                                flag=2)
    dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
    dfF = gridMET.readBasin(siteNo)
    for k in range(len(dfC)):
        ct = dfC.index[k]
        ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
        if (ctR[0] < startDate) or (ctR[-1] > endDate):
            continue
        tempQ = pd.DataFrame({
            'date': ctR
        }).set_index('date').join(dfQ).interpolate(limit=nFill,
                                                   limit_direction='both')
        tempF = pd.DataFrame({
            'date': ctR
        }).set_index('date').join(dfF).join(dfP).interpolate(
            limit=nFill, limit_direction='both')
        qLst.append(tempQ.values)
        fLst.append(tempF.values)
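
Per sample date, the loop builds a rho-day window ending on that date, joins the daily tables, and gap-fills them. A toy version of that window construction, reduced to a single synthetic forcing table (no usgs/gridMET/ntn modules needed):

import numpy as np
import pandas as pd

rho, nFill = 7, 2
tAll = pd.date_range('2000-01-01', '2000-03-31')
dfF = pd.DataFrame({'pr': np.random.rand(len(tAll))}, index=tAll)
ct = pd.Timestamp('2000-02-15')  # one sample date, like dfC.index[k]

# rho-day window ending on ct, joined and interpolated as tempF above
ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
tempF = pd.DataFrame({'date': ctR}).set_index('date').join(dfF).interpolate(
    limit=nFill, limit_direction='both')
print(tempF.shape)  # (rho, number of forcing variables)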
Example #4
def wrapData(caseName,
             siteNoLst,
             rho=365,
             nFill=5,
             varC=usgs.varC,
             varG=gageII.lstWaterQuality):
    """ wrap up input and target data for the model,as:
    x=[nT,nP,nX]
    y=[nP,nY]
    c=[nP,nC]
    where nP is number of time series
    Arguments:
        caseName {str} -- name of current data case
        siteNoLst {list} -- list of USGS site
    Keyword Arguments:
        rho {int} -- [description] (default: {365})
        nFill {int} -- max number of continous nan to interpolate in input data (default: {5})
        varC {list} -- list of water quality code to learn (default: {usgs.lstCodeSample})
        varG {list} -- list of constant variables in gageII (default: {gageII.lstWaterQuality})
        varQ and varF are fixed so far
    """
    # add a start/end date to improve efficiency.
    startDate = pd.Timestamp('1979-01-01')
    endDate = pd.Timestamp('2019-12-31')

    # gageII
    tabG = gageII.readData(varLst=varG, siteNoLst=siteNoLst)
    tabG = gageII.updateCode(tabG)

    # read data and merge to: f/q=[nT,nP,nX], g/c=[nP,nY]
    fLst = list()  # forcing ts
    gLst = list()  # geo-const
    qLst = list()  # streamflow
    cLst = list()  # water quality
    cfLst = list()  # water quality flags
    infoLst = list()
    t0 = time.time()
    for i, siteNo in enumerate(siteNoLst):
        t1 = time.time()
        dfC, dfCF = usgs.readSample(siteNo,
                                    codeLst=varC,
                                    startDate=startDate,
                                    flag=2)
        dfQ = usgs.readStreamflow(siteNo, startDate=startDate)
        dfF = gridMET.readBasin(siteNo)
        for k in range(len(dfC)):
            ct = dfC.index[k]
            ctR = pd.date_range(ct - pd.Timedelta(days=rho - 1), ct)
            if (ctR[0] < startDate) or (ctR[-1] > endDate):
                continue
            tempQ = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfQ).interpolate(limit=nFill,
                                                       limit_direction='both')
            tempF = pd.DataFrame({
                'date': ctR
            }).set_index('date').join(dfF).interpolate(limit=nFill,
                                                       limit_direction='both')
            qLst.append(tempQ.values)
            fLst.append(tempF.values)
            cLst.append(dfC.iloc[k].values)
            cfLst.append(dfCF.iloc[k].values)
            gLst.append(tabG.loc[siteNo].values)
            infoLst.append(dict(siteNo=siteNo, date=ct))
        t2 = time.time()
        print('{} on site {} reading {:.3f} total {:.3f}'.format(
            i, siteNo, t2 - t1, t2 - t0))
    q = np.stack(qLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
    g = np.stack(gLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    c = np.stack(cLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    cf = np.stack(cfLst, axis=-1).swapaxes(0, 1).astype(np.float32)
    infoDf = pd.DataFrame(infoLst)
    # add runoff
    runoff = calRunoff(q[:, :, 0], infoDf)
    q = np.stack([q[:, :, 0], runoff], axis=-1).astype(np.float32)
    saveFolder = os.path.join(kPath.dirWQ, 'trainData')
    saveName = os.path.join(saveFolder, caseName)
    np.savez(saveName, q=q, f=f, c=c, g=g, cf=cf)
    infoDf.to_csv(saveName + '.csv')
    dictData = dict(name=caseName,
                    rho=rho,
                    nFill=nFill,
                    varG=varG,
                    varC=varC,
                    varQ=['00060', 'runoff'],
                    varF=gridMET.varLst,
                    siteNoLst=siteNoLst)
    with open(saveName + '.json', 'w') as fp:
        json.dump(dictData, fp, indent=4)
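
The shapes produced by the np.stack/swapaxes calls can be checked in isolation. A minimal sketch with random arrays (sizes chosen arbitrarily) reproducing the x=[nT,nP,nX] convention from the docstring:

import numpy as np

rho, nX, nP = 365, 8, 3           # arbitrary sizes for illustration
fLst = [np.random.rand(rho, nX) for _ in range(nP)]
f = np.stack(fLst, axis=-1).swapaxes(1, 2).astype(np.float32)
print(f.shape)  # (365, 3, 8) -> [nT, nP, nX]

Since np.savez appends the .npz extension, the arrays written above can later be reloaded with np.load(saveName + '.npz'), and the accompanying .json file restores the variable lists and site list.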
Example #5
# read data (fragment: siteNo, varLst, freq, area, sd and ed are assumed
# to be defined earlier, as in readSiteTS above)
td = pd.date_range(sd, ed)
varC = list(set(varLst).intersection(usgs.varC))
varQ = list(set(varLst).intersection(usgs.varQ))
varF = list(set(varLst).intersection(gridMET.varLst))
varP = list(set(varLst).intersection(ntn.varLst))

dfD = pd.DataFrame({'date': td}).set_index('date')
if len(varC) > 0:
    dfC = usgs.readSample(siteNo, codeLst=varC, startDate=sd)
if len(varQ) > 0:
    dfQ = usgs.readStreamflow(siteNo, startDate=sd)
    dfQ = dfQ.rename(columns={'00060_00003': '00060'})
if len(varF) > 0:
    dfF = gridMET.readBasin(siteNo, varLst=varF)
if len(varP) > 0:
    dfP = ntn.readBasin(siteNo, varLst=varP, freq=freq)

# extract data
if 'runoff' in varLst:
    if area is None:
        tabArea = gageII.readData(varLst=['DRAIN_SQKM'], siteNoLst=[siteNo])
        area = tabArea['DRAIN_SQKM'].values[0]
    dfQ['runoff'] = waterQuality.calRunoffArea(dfQ['00060'], area)
dfD = dfD.join(dfQ)
dfD = dfD.join(dfF)
dfD = dfD.join(dfC)
dfD = dfD.join(dfP)
dfD = dfD[varLst]
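
calRunoffArea / waterQuality.calRunoffArea is not shown in these examples; presumably it normalizes discharge by drainage area. A hypothetical stand-in, assuming 00060 is discharge in cubic feet per second and the target is runoff depth in mm/day (the library's actual constant and units may differ):

import pandas as pd

def calRunoffAreaSketch(q, area):
    # q: discharge in ft^3/s (USGS code 00060); area: drainage area in km^2
    # returns runoff depth in mm/day (hypothetical units, see note above)
    CFS_TO_M3_PER_DAY = 0.0283168 * 86400
    return q * CFS_TO_M3_PER_DAY / (area * 1e6) * 1000.0

print(calRunoffAreaSketch(pd.Series([100.0]), 250.0))  # ~0.98 mm/day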