Example #1
def rd_niwa_rcp(
        base_path,
        mtypes,
        poly,
        vcsn_sites_csv=r'\\fileservices02\ManagedShares\Data\VirtualClimate\GIS\niwa_vcsn_wgs84.csv',
        id_col='Network',
        x_col='deg_x',
        y_col='deg_y',
        output_fun=None,
        export_path='output'):
    """
    Function to read in the NIWA RCP netcdf files and output the data in a specified format.
    """

    mtype_name = {
        'precip': 'TotalPrecipCorr',
        'T_max': 'MaxTempCorr',
        'T_min': 'MinTempCorr',
        'P_atmos': 'MSLP',
        'PET': 'PE',
        'RH_mean': 'RelHum',
        'R_s': 'SurfRad',
        'U_z': 'WindSpeed'
    }

    ### Import and reorganize data
    vcsn_sites = pd.read_csv(vcsn_sites_csv)[[id_col, x_col, y_col]]

    sites_gpd = vector.xy_to_gpd(id_col, x_col, y_col, vcsn_sites, 4326)
    poly1 = gpd.read_file(poly)

    sites_gpd2 = sites_gpd.to_crs(poly1.crs)

    mtypes1 = [mtype_name[i] for i in mtypes]

    ### Select sites
    sites_gpd3 = vector.sel_sites_poly(sites_gpd2, poly1)[id_col]
    site_loc1 = vcsn_sites[vcsn_sites[id_col].isin(sites_gpd3)]
    site_loc1.columns = ['id', 'x', 'y']

    ### Read and extract data from netcdf files

    for root, dirs, files in os.walk(base_path):
        files2 = [i for i in files if i.endswith('.nc')]
        files3 = [j for j in files2 if any(j.startswith(i) for i in mtypes1)]
        file_paths1 = [os.path.join(root, i) for i in files3]
        if len(file_paths1) > 0:
            ds = rd_niwa_rcp_dir(file_paths1, site_loc1, mtypes)
            if callable(output_fun):
                new_base_path = root.replace(base_path, export_path)
                base_file_name = file_paths1[0].split('VCSN_')[1]
                if not os.path.exists(new_base_path):
                    os.makedirs(new_base_path)
                output_fun(ds, new_base_path, base_file_name)
                print(base_file_name)
            else:
                raise ValueError('Must have an output function.')
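
## Hypothetical usage sketch (not part of the original example): the paths and measurement
## types below are placeholders, and it assumes the object passed to output_fun behaves like
## an xarray Dataset (i.e. has a to_netcdf method).
def save_nc(ds, out_dir, base_file_name):
    # write each extracted dataset into the mirrored directory structure under export_path
    ds.to_netcdf(os.path.join(out_dir, base_file_name))

rd_niwa_rcp(
    base_path=r'path\to\niwa_rcp_netcdfs',   # placeholder
    mtypes=['precip', 'PET'],
    poly=r'path\to\catchment_boundary.shp',  # placeholder polygon shapefile
    output_fun=save_nc,
    export_path='output')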
Example #2

site_xy1 = gpd.GeoDataFrame(site_xy['ExtSiteID'], geometry=geometry, crs=2193)

gauge_xy = site_xy1[site_xy1.ExtSiteID.isin([site])].set_index('ExtSiteID').copy()
rec_xy = site_xy1[site_xy1.ExtSiteID.isin(rec_sites)].set_index('ExtSiteID').copy()

###########################################
### Iterate through the low flow sites

results_dict = {}
for g in gauge_xy.index:
    print(g)

    ## Determine recorder sites within search distance
    near_rec_sites = sel_sites_poly(rec_xy, gauge_xy.loc[[g]], search_dis)
    print('There are ' + str(len(near_rec_sites)) +
          ' recorder sites within range')

    ## Extract all ts data
    g_data = mssql.rd_sql(server,
                          database,
                          ts_daily_table, ['ExtSiteID', 'DateTime', 'Value'],
                          where_in={
                              'DatasetTypeID': man_datasets,
                              'ExtSiteID': [g],
                              'QualityCode': qual_codes
                          },
                          from_date=min_date,
                          date_col='DateTime')
    g_data.DateTime = pd.to_datetime(g_data.DateTime)
Example #3

rec_xy = site_xy1[site_xy1.ExtSiteID.isin(rec_sites.ExtSiteID)].set_index('ExtSiteID').copy()

###########################################
### Iterate through the low flow sites

results_dict = {}
for g in man_sites2.ExtSiteID.tolist():
    print(g)

    ## Max trig
    #    max_trig = man_sites2[man_sites2.ExtSiteID == g].max_trig.tolist()[0]

    ## Determine recorder sites within search distance
    man_loc = man_xy.loc[[g.strip()]].buffer(search_dis)
    near_rec_sites = sel_sites_poly(rec_xy, man_loc)
    print('There are ' + str(len(near_rec_sites)) +
          ' recorder sites within range')

    if not near_rec_sites.empty:
        ## Extract all ts data
        g_data = mssql.rd_sql(server,
                              database,
                              ts_daily_table,
                              ['ExtSiteID', 'DateTime', 'Value'],
                              where_col={
                                  'DatasetTypeID': man_datasets,
                                  'ExtSiteID': [g],
                                  'QualityCode': qual_codes
                              },
                              from_date=min_date,
Example #4
rec_streams_shp_path = get_path(rec_streams_shp)
rec_catch_shp_path = get_path(rec_catch_shp)
catch_shp_path = get_path(catch_shp)

sites_col_name = 'SITENUMBER'
poly_col_name = 'Catchmen_1'
line_site_col = 'NZREACH'

#######################################
### Examples

pts = util.load_geo_data(sites_shp_path)
pts['geometry'] = pts.geometry.simplify(1)

## Selecting points from within a polygon
pts1 = vector.sel_sites_poly(sites_shp_path, rec_catch_shp_path, buffer_dis=10)

## Spatial join attributes of polygons to points
pts2, poly2 = vector.pts_poly_join(sites_shp_path, catch_shp_path, poly_col_name)

## Create a GeoDataFrame from x and y data
pts_df = pts[[sites_col_name, 'geometry']].copy()
pts_df['x'] = pts_df.geometry.x
pts_df['y'] = pts_df.geometry.y
pts_df.drop('geometry', axis=1, inplace=True)

pts3 = vector.xy_to_gpd(sites_col_name, 'x', 'y', pts_df)

## Find the closest line to points
line1 = vector.closest_line_to_pts(sites_shp_path, rec_streams_shp_path, line_site_col, buffer_dis=100)
Example #5

###########################################
### Iterate through the low flow sites

if isinstance(input_sites, list):
    man_sites2 = man_sites1[man_sites1.ExtSiteID.isin(input_sites)]
else:
    man_sites2 = man_sites1

results_dict = {}
for g in man_sites1.ExtSiteID.tolist():
    print(g)

    man_loc = site_xy1.loc[[g.strip()], :]
    man_loc['geometry'] = man_loc.buffer(search_dis)
    near_rec_sites = sel_sites_poly(site_xy1, man_loc)
    near_rec_sites = near_rec_sites.loc[near_rec_sites.index != g]
    print('There are ' + str(len(near_rec_sites)) + ' sites within range')

    if not near_rec_sites.empty:
        ## Extract all ts data
        g_data = mssql.rd_sql(server,
                              database,
                              ts_daily_table,
                              ['ExtSiteID', 'DateTime', 'Value'],
                              where_in={
                                  'DatasetTypeID': man_datasets,
                                  'ExtSiteID': [g],
                                  'QualityCode': qual_codes
                              },
                              from_date=min_date,
Example #6
def getCorrelations(self):
    '''
    Calculate correlations (fits) for flow sites that do not have a recorder.
    Returns:
        - dataframe with the 6 best correlations for each flow site
        - dataframe with flow time-series for all sites (recorder and manual gaugings)
        - geopandas dataframe with locations of recorder sites within a buffer distance from the manual sites
    Writes:
        - csv-file with the 6 best correlations for each flow site
        - csv-file with flow time-series for all sites (recorder and manual gaugings)
        - shapefile with locations of recorder sites within a buffer distance from the manual sites
    '''
    
    dataset_type_table = 'DatasetType'
    ts_summ_table = 'TSDataNumericDailySumm'
    ts_table = 'TSDataNumericDaily'
    site_table = 'ExternalSite'
    
    self.min_flow_obs = self.config.getint('FLOW_CORRELATIONS', 'min_flow_obs')
    self.buf_dist = self.config.getint('FLOW_CORRELATIONS', 'buf_dist')
    self.filter_winter_flow = self.config.getint('FLOW_CORRELATIONS', 'filter_winter_flow')
    
    #-lists of stations and corresponding dates for which flow records should be removed before correlation is calculated
    remove_stat_dates = self.config.get('FLOW_CORRELATIONS', 'remove_stat_dates').split(',')
    if len(remove_stat_dates)>1:
        remove_stats = remove_stat_dates[0:-1:2]
        remove_dates = remove_stat_dates[1::2]
    else:
        remove_stats = False
        remove_dates = False
    remove_stat_dates = None

    #-Get lowflow sites
    flow_sites = self.flow_sites_gdf.flow_site.tolist()
 
    ## Read in datasettypes
    datasets = mssql.rd_sql(self.server, self.database, dataset_type_table, ['DatasetTypeID', 'CTypeID'], where_in={'FeatureID': [1], 'MTypeID': [2], 'CTypeID': [1, 2], 'DataCodeID': [1]})
     
    #-Get datasetTypes for recorded data and manual data
    rec_datasetTypes = datasets[datasets.CTypeID == 1].DatasetTypeID.tolist()
    man_datasetTypes = datasets[datasets.CTypeID == 2].DatasetTypeID.tolist()
    all_datasetTypes = rec_datasetTypes.copy()
    all_datasetTypes.extend(man_datasetTypes)
     
    #-Get summary table for the lowflow sites
    site_summ = mssql.rd_sql(self.server, self.database, ts_summ_table, where_in={'DatasetTypeID': all_datasetTypes, 'ExtSiteID': flow_sites})
    site_summ.FromDate = pd.to_datetime(site_summ.FromDate)
    site_summ.ToDate = pd.to_datetime(site_summ.ToDate)
    site_summ.drop('ModDate', axis=1, inplace=True)
     
    #-Throw out sites from summary table and lowflow sites that have less records than 'min_flow_obs'
    too_short = site_summ.loc[site_summ.Count<self.min_flow_obs, ['ExtSiteID','Count']]
    if len(too_short)>0:
        for j in too_short.iterrows():
            print('ExtSiteID %s is not used because it only has %s records, which is less than %s.' %(j[1]['ExtSiteID'], int(j[1]['Count']), self.min_flow_obs))
            site_summ = site_summ.loc[site_summ['ExtSiteID']!=j[1]['ExtSiteID']]
            flow_sites.remove(j[1]['ExtSiteID'])
     
    #-Get site ids for recorder and manual sites of the lowflow sites and create geodataframe for manually recorded lowflow sites
    rec_sites = site_summ.loc[site_summ.DatasetTypeID.isin(rec_datasetTypes),'ExtSiteID'].tolist()
    man_sites = site_summ.loc[site_summ.DatasetTypeID.isin(man_datasetTypes),'ExtSiteID'].tolist()
    man_sites_gdf = self.flow_sites_gdf[self.flow_sites_gdf.flow_site.isin(man_sites)].copy()
     
    #-get all recorder flow sites within a buffer distance from the manually recorded lowflow sites
    man_sites_buffer_gdf = man_sites_gdf.copy()
    man_sites_buffer_gdf['geometry'] = man_sites_gdf.buffer(self.buf_dist)
    all_recorder_flowsites = mssql.rd_sql(self.server, self.database, ts_summ_table, ['ExtSiteID'], where_in={'DatasetTypeID': rec_datasetTypes})
    all_recorder_flowsites = pd.unique(all_recorder_flowsites.ExtSiteID).tolist()
    all_recorder_flowsites =  mssql.rd_sql(self.server, self.database, site_table, ['ExtSiteID', 'NZTMX', 'NZTMY'], where_in={'ExtSiteID': all_recorder_flowsites})
    all_recorder_flowsites_gdf = vector.xy_to_gpd('ExtSiteID', 'NZTMX', 'NZTMY', all_recorder_flowsites)
    self.rec_sites_buffer_gdf = vector.sel_sites_poly(all_recorder_flowsites_gdf, man_sites_buffer_gdf)
    #-write recorder sites within buffer to a shapefile
    self.rec_sites_buffer_gdf.to_file(os.path.join(self.results_path, self.config.get('FLOW_CORRELATIONS', 'rec_sites_shp')))
    all_recorder_flowsites = None; all_recorder_flowsites_gdf = None; del all_recorder_flowsites, all_recorder_flowsites_gdf  
     
     
    #-merge list of lowflow sites with list of recorder sites in buffer to create list of all sites for which to extract data
    all_flow_sites = flow_sites.copy()
    all_flow_sites.extend(pd.unique(self.rec_sites_buffer_gdf.ExtSiteID).tolist())
    all_flow_sites = pd.DataFrame(columns=all_flow_sites)
    all_flow_sites = pd.unique(all_flow_sites.columns).tolist()
     
    #-Get time-series of all_flow_sites
    ts_df = mssql.rd_sql(self.server, self.database, ts_table, ['ExtSiteID', 'DatasetTypeID', 'DateTime', 'Value', 'QualityCode'], where_in={'DatasetTypeID': all_datasetTypes, 'ExtSiteID': all_flow_sites})
    ts_df = ts_df.loc[ts_df.QualityCode>100] #-keep everything other than missing data
    ts_df.DateTime = pd.to_datetime(ts_df.DateTime)
    ts_df_sites = pd.unique(ts_df.ExtSiteID).tolist()
    all_flow_sites = None; del all_flow_sites
     
    #-Check if there is data for all lowflow sites
    for j in flow_sites:
        if j not in ts_df_sites:
            print('There is no flow data for site %s, or the record of observations for this site was too short.' %j)
    #-Minimum and maximum date of all data
    min_date = ts_df.DateTime.min()
    max_date = ts_df.DateTime.max()
     
    #-Fill dataframe with data for min_date through max_date
    df_final = pd.DataFrame(index=pd.date_range(min_date, max_date, freq='D'), columns=ts_df_sites)
    df_final.rename_axis('DateTime', inplace=True)
    for j in ts_df_sites:
        df_short = ts_df.loc[ts_df.ExtSiteID==j, ['DateTime', 'Value','QualityCode']]
        #-keep only the records with the highest qualitycode
        df_short_group = df_short.groupby(['DateTime', 'QualityCode']).mean()
        df_short_group.reset_index(inplace=True)
        df_short_group.sort_values(by=['DateTime', 'QualityCode'], inplace=True)
        df_short_group.drop_duplicates(subset='DateTime', keep='last', inplace=True)
        df_short_group.drop('QualityCode', inplace=True, axis=1)
        df_short_group.set_index('DateTime', inplace=True)
        df_short_group.rename(columns={'Value':j}, inplace=True)
        df_final[[j]] = df_short_group
    ts_df = None; df_short_group = None; del ts_df, df_short_group
    df_final.dropna(how='all', inplace=True)
     
    #-create a copy of all gauged and recorded flow (before filtering) to make it accessible in the main class
    self.flow_ts_df = df_final.copy()
    self.flow_ts_df.to_csv(os.path.join(self.results_path, self.config.get('FLOW_CORRELATIONS', 'flow_ts_csv')))
     
    #-remove zero records
    df_final[df_final==0] = np.nan
    
    #-remove unreliable values using lists of stations and dates
    if remove_stats and remove_dates:
        df_final.reset_index(inplace=True)
        i = 0
        for s in remove_stats:
            stat_date = remove_dates[i]
            df_final.loc[(df_final['DateTime']==stat_date), s] = np.nan
            i+=1
        df_final.set_index('DateTime', inplace=True)
     
    #-keep only winter flows for estimating correlations
    if self.filter_winter_flow:
        df_final.reset_index(inplace=True)
        df_final['Month'] = df_final['DateTime'].dt.strftime('%m').astype(int)
        df_final.set_index('DateTime', inplace=True)
        df_final = df_final.loc[(df_final.Month>4) & (df_final.Month<10)]
    df_final.dropna(how='all', inplace=True)
     
    #-loop over the lowflow sites and calculate regressions
    df_regression = pd.DataFrame(columns=['y', 'x', 'mean_y', 'mean_x', 'nobs', 'rmse', 'Adj. R2-squared', 'p-value', 'slope', 'intercept', 'power', 'fittype'])
    i=0
    for j in flow_sites:
        for s in self.rec_sites_buffer_gdf.ExtSiteID.tolist():
            sel_df = df_final[[j, s]]
            sel_df.replace(0., np.nan, inplace=True)
            sel_df.dropna(inplace=True)
            #-only fit for sites that have the minimum number of observations
            if len(sel_df)>=self.min_flow_obs:
                try: #-linear regression
                    #-set x and y
                    x = sel_df[[s]]
                    x = sm.add_constant(x)  #-needed for intercept
                    y = sel_df[[j]]
                       
                    #-linear fit
                    model = sm.OLS(y, x).fit()
                       
                    #-predict using the model
                    predictions = model.predict(x) # make the predictions by the model
                    rmse_val = np.sqrt(np.mean((predictions.to_numpy() - y.to_numpy().flatten())**2))
                       
                    #-fill dataframe with stats for linear fit
                    df_regression.loc[i, 'y'] = j
                    df_regression.loc[i, 'x'] = s
                    df_regression.loc[i, 'mean_y'] = np.nanmean(y)
                    df_regression.loc[i, 'mean_x'] = np.nanmean(x[s].to_numpy())
                    df_regression.loc[i, 'nobs'] = model.nobs
                    df_regression.loc[i, 'rmse'] = rmse_val
                    df_regression.loc[i, 'Adj. R2-squared'] = model.rsquared_adj
                    df_regression.loc[i, 'p-value'] = model.f_pvalue
                    df_regression.loc[i, 'slope'] = model.params[1]
                    df_regression.loc[i, 'intercept'] = model.params[0]
                    df_regression.loc[i, 'fittype'] = 'linear'
                       
                    i+=1
                except:
                    print('Could not establish linear fit for %s and %s...' %(j,s))
      
                try: #-power fit regression
                    #-set x and y
                    x = sel_df[[s]]
                    x = sm.add_constant(x)  #-needed for intercept
                    y = sel_df[[j]]                 
      
                    #-***********************************************************
                    #-                           y = ax^b
                    #-                           log(y) = log(a) + b*log(x)
                    #-***********************************************************
                    logY = np.log(y)
                    logX = np.log(x[[s]])
                    logX = sm.add_constant(logX)
                    model = sm.OLS(logY, logX).fit()
       
                    x = x[s].to_numpy()
                    #-predictions of the exact x values for calculating rmse            
                    predictions = np.exp(model.params[0]) * x**model.params[1]
                    rmse_val = np.sqrt(np.mean((predictions - y.to_numpy().flatten())**2))
                       
                    #-fill dataframe with stats for power fit
                    df_regression.loc[i, 'y'] = j
                    df_regression.loc[i, 'x'] = s
                    df_regression.loc[i, 'mean_y'] = np.nanmean(y)
                    df_regression.loc[i, 'mean_x'] = np.nanmean(x)
                    df_regression.loc[i, 'nobs'] = model.nobs
                    df_regression.loc[i, 'rmse'] = rmse_val
                    df_regression.loc[i, 'Adj. R2-squared'] = model.rsquared_adj
                    df_regression.loc[i, 'p-value'] = model.f_pvalue
                    df_regression.loc[i, 'slope'] = np.exp(model.params[0])
                    df_regression.loc[i, 'power'] = model.params[1]
                    df_regression.loc[i, 'fittype'] = 'power'
                    i+=1
                except:
                    print('Could not establish power fit for %s and %s...' %(j,s))
    sel_df = None; del sel_df
    #-remove negative correlations
    df_regression.loc[df_regression['Adj. R2-squared']<0.,:] = np.nan
    df_regression.dropna(how='all', inplace=True)
      
    #-loop over lowflow sites, and select best six fits by sorting on R2, rmse, nobs
    self.best_regressions_df = pd.DataFrame(columns=df_regression.columns)
    for j in flow_sites:
        sel_df = df_regression.loc[df_regression.y == j]
        sel_df.sort_values(by=['Adj. R2-squared', 'rmse', 'nobs'], ascending=[False, True, False], inplace=True)
        sel_df = sel_df.iloc[0:6,:]
        self.best_regressions_df = pd.concat([self.best_regressions_df, sel_df])
    sel_df = None; del sel_df
    self.best_regressions_df.to_csv(os.path.join(self.results_path, self.config.get('FLOW_CORRELATIONS', 'correlations_csv')), index=False)
     
    #-make 6 plots for each site to visually pick the best
    unique_y = pd.unique(self.best_regressions_df['y'])
    unique_y = list(unique_y)
    for j in flow_sites:
        if j in unique_y:
            #-get regression results for site j
            sel_df_regression = self.best_regressions_df.loc[self.best_regressions_df.y == j]
     
            #-plot the best 6 fits
            fig = plt.figure(figsize=(11,8))
            kk=1
            for k in sel_df_regression.iterrows():
                try:
                    ax1 = plt.subplot(3,2,kk)
                    xSite = k[1]['x']
                    ySite = k[1]['y']
                     
                    xy = df_final[[xSite, ySite]]
                    xy.dropna(inplace=True)
                    x = xy[xSite].to_numpy()
                    y = xy[ySite].to_numpy()
         
                    xmin = np.nanmin(x)
                    xmax = np.nanmax(x)
                    dx = (xmax-xmin)/100
                    xvals = np.arange(xmin, xmax, dx)
                    ymin = np.nanmin(y)
                    ymax = np.nanmax(y)
                     
                    #-Get the stats from the table
                    fit_type = k[1]['fittype']
                    if fit_type == 'linear':
                        eq_str = 'y = %.3fx + %.3f' %(k[1]['slope'], k[1]['intercept'])
                        predictions = k[1]['slope'] * xvals + k[1]['intercept']
                    else:
                        eq_str = 'y = %.3fx$^{%.3f}$' %(k[1]['slope'], k[1]['power'])
                        predictions = k[1]['slope'] * (xvals ** k[1]['power'])
                    r2 = k[1]['Adj. R2-squared']
                    rmse_val = k[1]['rmse']
                    p_val = k[1]['p-value']
                    nobs = k[1]['nobs']
                    plt.plot(xvals, predictions, label='fit')
                    plt.scatter(x,y, color='red', label='data')
                    ax1.grid(True)
                    ax1.set_xlabel('Flow at %s [m$^3$ s$^{-1}$]' %xSite)
                    ax1.set_ylabel('Flow at %s [m$^3$ s$^{-1}$]' %ySite)
                    ax1.legend(loc='lower right')
                     
                    dy = (ymax-ymin)/10
                    dx = (xmax-xmin)/10
                     
                    #-plot the stats in the plot
                    plt.text(xmin, ymax, eq_str)
                    plt.text(xmin, ymax-dy, 'R$^2$ %.2f' %r2)
                    plt.text(xmin, ymax-2*dy, 'RMSE %.2f' %rmse_val)
                    plt.text(xmin, ymax-3*dy, 'p-value %.2f' %p_val)
                    plt.text(xmin, ymax-4*dy, 'nobs %.0f' %nobs)
                    ax1.set_xlim([xmin-dx, xmax+dx])
                    ax1.set_ylim([ymin-dy, ymax+dy])
                    kk+=1
                except:
                    pass
            fig.tight_layout()
            if self.filter_winter_flow:
                plt.savefig(os.path.join(self.results_path, '%s_correlations_winteronly.png' %ySite), dpi=300)
            else:
                plt.savefig(os.path.join(self.results_path, '%s_correlations.png' %ySite), dpi=300)
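
#-A hedged follow-on sketch (not part of the original function): how one row of the
#-correlations table produced above could be applied to estimate flow at a gauged site
#-from its paired recorder site. The fitted values below are made up for illustration.
import numpy as np
import pandas as pd

best_fit = pd.Series({'y': 'site_A', 'x': 'site_B', 'slope': 0.85,
                      'intercept': 0.02, 'power': 1.1, 'fittype': 'power'})
recorder_flow = pd.Series([0.5, 1.2, 3.4])  # flow at the 'x' site [m3/s], illustrative only
if best_fit['fittype'] == 'power':
    est_flow = best_fit['slope'] * recorder_flow ** best_fit['power']     # y = a * x^b
else:
    est_flow = best_fit['slope'] * recorder_flow + best_fit['intercept']  # y = a * x + b
print(est_flow)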
Example #7
sites_buffer_gdf = flow_sites_gdf.copy()
sites_buffer_gdf['geometry'] = sites_buffer_gdf.buffer(60000)

all_gwl_sites = mssql.rd_sql(
    server,
    database,
    ts_summ_table, ['ExtSiteID'],
    where_in={'DatasetTypeID': datasets.DatasetTypeID.tolist()})
all_gwl_sites = pd.unique(all_gwl_sites.ExtSiteID).tolist()
all_gwl_sites = mssql.rd_sql(server,
                             database,
                             site_table, ['ExtSiteID', 'NZTMX', 'NZTMY'],
                             where_in={'ExtSiteID': all_gwl_sites})
all_gwl_sites_gdf = vector.xy_to_gpd('ExtSiteID', 'NZTMX', 'NZTMY',
                                     all_gwl_sites)
sel_gwl_sites_gdf = vector.sel_sites_poly(all_gwl_sites_gdf, sites_buffer_gdf)
all_gwl_sites = None
all_gwl_sites_gdf = None

#-Get summary table for the gwl sites and filter period of interest
site_summ = mssql.rd_sql(server,
                         database,
                         ts_summ_table,
                         where_in={
                             'DatasetTypeID': datasets.DatasetTypeID.tolist(),
                             'ExtSiteID': sel_gwl_sites_gdf.ExtSiteID.tolist()
                         })
site_summ.FromDate = pd.to_datetime(site_summ.FromDate)
site_summ.ToDate = pd.to_datetime(site_summ.ToDate)
site_summ.drop('ModDate', axis=1, inplace=True)
site_summ = site_summ.loc[(site_summ.FromDate < to_date)
Example #8
def rd_niwa_vcsn(
        mtypes,
        sites,
        nc_path=r'\\fileservices02\ManagedShares\Data\VirtualClimate\vcsn_precip_et_2016-06-06.nc',
        vcsn_sites_csv=r'\\fileservices02\ManagedShares\Data\VirtualClimate\GIS\niwa_vcsn_wgs84.csv',
        id_col='Network',
        x_col='deg_x',
        y_col='deg_y',
        buffer_dis=0,
        include_sites=False,
        from_date=None,
        to_date=None,
        out_crs=None,
        netcdf_out=None):
    """
    Function to read in the NIWA vcsn netcdf file and output the data as a dataframe.

    mtypes -- A string or list of the measurement types (either 'precip', or 'PET').\n
    sites -- Either a list of vcsn site names or a polygon of the area of interest.\n
    nc_path -- The path to the vcsn nc file.\n
    vcsn_sites_csv -- The csv file that relates the site name to coordinates.\n
    id_col -- The site name column in vcsn_sites_csv.\n
    x_col -- The x column name in vcsn_sites_csv.\n
    y_col -- The y column name in vcsn_sites_csv.\n
    buffer_dis -- An optional buffer distance (in the polygon's crs units) used when selecting sites within the polygon.\n
    include_sites -- Should the site names be added to the output?\n
    from_date -- Optional start date (as a string) to limit the output period.\n
    to_date -- Optional end date (as a string) to limit the output period.\n
    out_crs -- The crs epsg number for the output coordinates if different than the default WGS84 (e.g. 2193 for NZTM).\n
    netcdf_out -- Optional path to also save the selected data to a netcdf file.
    """

    mtype_name = {'precip': 'rain', 'PET': 'pe'}

    ### Import and reorganize data
    vcsn_sites = pd.read_csv(vcsn_sites_csv)[[id_col, x_col, y_col]]

    if isinstance(sites, str):
        if sites.endswith('.shp'):
            sites_gpd = vector.xy_to_gpd(id_col, x_col, y_col, vcsn_sites,
                                         4326)
            poly1 = gpd.read_file(sites)

            sites_gpd2 = sites_gpd.to_crs(poly1.crs)

            ### Select sites
            sites2 = vector.sel_sites_poly(sites_gpd2, poly1,
                                           buffer_dis)[id_col]
    elif isinstance(sites, (list, pd.Series, np.ndarray)):
        sites2 = sites

    ### Select locations
    site_loc1 = vcsn_sites[vcsn_sites[id_col].isin(sites2)]
    site_loc1.columns = ['id', 'x', 'y']

    ### Select mtypes
    if isinstance(mtypes, str):
        mtypes1 = [mtype_name[mtypes]]
    else:
        mtypes1 = [mtype_name[i] for i in mtypes]

    if include_sites:
        mtypes1.extend(['site'])

    ### Read and extract data from netcdf files
    ds1 = xr.open_dataset(nc_path)
    time1 = pd.to_datetime(ds1.time.values)
    if isinstance(from_date, str):
        time1 = time1[time1 >= from_date]
    if isinstance(to_date, str):
        time1 = time1[time1 <= to_date]
    lat1 = ds1.latitude.values
    lon1 = ds1.longitude.values
    lat2 = lat1[np.in1d(lat1, site_loc1.y.unique())]
    lon2 = lon1[np.in1d(lon1, site_loc1.x.unique())]
    ds2 = ds1.loc[{'longitude': lon2, 'time': time1.values, 'latitude': lat2}]
    ds3 = ds2[mtypes1]

    ### Convert to DataFrame
    df1 = ds3.to_dataframe().reset_index()
    df1.rename(columns={'latitude': 'y', 'longitude': 'x'}, inplace=True)
    df1 = df1.dropna()

    ### Convert to different crs if needed
    if out_crs is not None:
        crs1 = util.convert_crs(out_crs)
        new_gpd1 = vector.xy_to_gpd('id', 'x', 'y', site_loc1, 4326)
        new_gpd2 = new_gpd1.to_crs(crs1)
        site_loc2 = site_loc1.copy()
        site_loc2['x_new'] = new_gpd2.geometry.apply(lambda j: j.x)
        site_loc2['y_new'] = new_gpd2.geometry.apply(lambda j: j.y)

        df2 = pd.merge(df1,
                       site_loc2[['x', 'y', 'x_new', 'y_new']],
                       on=['x', 'y'])
        df3 = df2.drop(['x', 'y'], axis=1).rename(columns={
            'x_new': 'x',
            'y_new': 'y'
        })
        col_order = ['y', 'x', 'time']
        col_order.extend(mtypes1)
        df4 = df3[col_order]
    else:
        df4 = df1

    ### Save and return
    if isinstance(netcdf_out, str):
        ds3.to_netcdf(netcdf_out)

    ds1.close()
    ds3.close()

    return df4
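
## A minimal usage sketch (not part of the original function): the site IDs below are
## placeholders, and the default nc_path/vcsn_sites_csv network shares in the signature
## above are assumed to be reachable.
precip_df = rd_niwa_vcsn(
    mtypes='precip',
    sites=['P120101', 'P120102'],  # placeholder VCSN network IDs
    from_date='2000-01-01',
    to_date='2000-12-31',
    out_crs=2193)                  # return coordinates in NZTM instead of WGS84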
Example #9

usage_output = 'pareora_huts_usage_mon.csv'

############################################
### Extract data

site_filter = {'CatchmentGroupName': catch_group}

a1 = AlloUsage(from_date, to_date, site_filter=site_filter)

sites0 = a1.sites.reset_index().copy()

sites = vector.xy_to_gpd('wap', 'NZTMX', 'NZTMY', sites0)

catch_del = gpd.read_file(catch_del_shp_path)
catch_del.rename(columns={'SITENUMBER': 'site'}, inplace=True)

catch_del1 = catch_del[catch_del.site == rec_site]

sites1 = vector.sel_sites_poly(sites, catch_del1)

## Usage data

usage1 = pd.read_hdf(os.path.join(source_path, usage_hdf))

usage2 = usage1[usage1.wap.isin(sites1.wap)].copy()
usage2['time'] = pd.to_datetime(usage2['time'])

usage2.to_csv(os.path.join(base_path, shp_dir, usage_output), index=False)

usage3 = usage2.groupby('time')['sd_usage'].sum()
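
## A hedged follow-on sketch (not in the original snippet): the '_mon' in the output file
## name above suggests a monthly summary, which could be produced by resampling the summed
## daily series; this assumes usage3 keeps the datetime index from the groupby above.
usage3_mon = usage3.resample('M').sum()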
Example #10

rec_rivers_dict = sql1.get_dict(rec_rivers_sql)
rec_catch_dict = sql1.get_dict(rec_catch_sql)

###################################
### Catchment delineation and WAPs

catch1 = rec.catch_delineate(sites2, rec_rivers_dict, rec_catch_dict)
catch1.to_file(os.path.join(results_path, catch_del_shp))

wap1 = mssql.rd_sql(server, database, crc_wap_table, ['wap']).wap.unique()

sites3 = sites1[sites1.ExtSiteID.isin(wap1)].copy()

sites4 = vector.xy_to_gpd('ExtSiteID', 'NZTMX', 'NZTMY', sites3)

sites5 = vector.sel_sites_poly(sites4, catch1)
sites5.to_file(os.path.join(results_path, wap_shp))

##################################
### Get crc data

allo1 = AlloUsage(crc_wap_filter={'wap': sites5.ExtSiteID.tolist()},
                  from_date=from_date,
                  to_date=to_date)

#allo1.allo[allo1.allo.crc_status == 'Terminated - Replaced']

allo1.allo_wap.to_csv(os.path.join(results_path, allo_wap_csv))
allo1.allo.to_csv(os.path.join(results_path, allo_csv))
Example #11
        'Catchment', 'DateTime of forecast', 'MetService product',
        'Catchment precipitation [mm]', 'Month'
    ])
    df_catchment_avg_final = pd.concat([
        df_catchment_avg_final,
        pd.DataFrame(columns=[i for i in range(1, 85 + 1)])
    ],
                                       axis=1)

    ###-now per catchment
    for catch in unique_catchments:
        print('Processing %s' % catch)
        #-Select only the geopandas dataframe of the catchment of interest
        sel_catch = catchment_gdf.loc[catchment_gdf['CATCH_NAME'] == catch]
        #-Select the stations that can be found within this catchment
        sel_stats = vector.sel_sites_poly(stations_gdf, sel_catch)
        #-Select only the precipitation data for the stations part of that catchment
        df_catchment = df.loc[df.ExtSiteID.isin(sel_stats.ExtSiteID.tolist())]
        df_catchment.insert(0, 'Catchment', catch)
        #-Create a copy of the catchment df that contains the stations to be used for averaging
        df_catchment_avg = df_catchment.copy()
        df_catchment_avg.drop('ExtSiteID', axis=1, inplace=True)
        #-Average precipitation station and forecast values
        df_catchment_avg = df_catchment_avg.groupby(
            'DateTime of forecast').mean()
        df_catchment_avg.reset_index(inplace=True)
        df_catchment_avg.insert(1, 'MetService product', fprod)
        df_catchment_avg.insert(0, 'Catchment', catch)

        #-Add columns for calculating percentual difference for individual stations
        df_catchment = pd.concat([df_catchment, df_proc], axis=1)
Example #12

    def flow_est(self, buffer_dis=50000):
        """
        Function to query and/or estimate flow at the input_sites.

        Parameters
        ----------
        buffer_dis : int
            The search radius for the regressions in meters.

        Returns
        -------
        DataFrame of Flow
        """

        ### Read data if it exists

        if self.input_summ.CollectionType.isin(['Recorder']).any():
            rec_summ1 = self.input_summ[self.input_summ.CollectionType.isin(['Recorder'])].copy()
            rec_ts_data1 = mssql.rd_sql_ts(self.ts_server, param['input']['ts_database'], param['input']['ts_table'], ['ExtSiteID', 'DatasetTypeID'], 'DateTime', 'Value', from_date=self.from_date, to_date=self.to_date, where_in={'ExtSiteID': rec_summ1.ExtSiteID.tolist(), 'DatasetTypeID': rec_summ1.DatasetTypeID.unique().tolist()}).reset_index()
            rec_ts_data1 = pd.merge(rec_summ1[['ExtSiteID', 'DatasetTypeID']], rec_ts_data1, on=['ExtSiteID', 'DatasetTypeID']).drop('DatasetTypeID', axis=1).set_index(['ExtSiteID', 'DateTime'])
            rec_ts_data2 = rec_ts_data1.Value.unstack(0)

        else:
            rec_ts_data2 = pd.DataFrame()

        ### Run correlations if necessary

        if self.input_summ.CollectionType.isin(['Manual Field']).any():
            man_summ1 = self.input_summ[self.input_summ.CollectionType.isin(['Manual Field'])].copy()
            man_sites1 = self.sites_gdf[self.sites_gdf.ExtSiteID.isin(man_summ1.ExtSiteID)].copy()

            ## Determine which sites are within the buffer of the manual sites

            buff_sites_dict = {}
            man_buff1 = man_sites1.set_index(['ExtSiteID']).copy()
            man_buff1['geometry'] = man_buff1.buffer(buffer_dis)

            rec_sites_gdf = self.sites_gdf[self.sites_gdf.CollectionType == 'Recorder'].copy()

            for index in man_buff1.index:
                buff_sites1 = vector.sel_sites_poly(rec_sites_gdf, man_buff1.loc[[index]])
                buff_sites_dict[index] = buff_sites1.ExtSiteID.tolist()

            buff_sites_list = [item for sublist in buff_sites_dict.values() for item in sublist]
            buff_sites = set(buff_sites_list)

            ## Pull out recorder data needed for all manual sites
            man_ts_data1 = mssql.rd_sql_ts(self.ts_server, param['input']['ts_database'], param['input']['ts_table'], ['ExtSiteID', 'DatasetTypeID'], 'DateTime', 'Value', from_date=self.from_date, to_date=self.to_date, where_in={'ExtSiteID': man_summ1.ExtSiteID.tolist(), 'DatasetTypeID': man_summ1.DatasetTypeID.unique().tolist()}).reset_index()
            man_ts_data1 = pd.merge(man_summ1[['ExtSiteID', 'DatasetTypeID']], man_ts_data1, on=['ExtSiteID', 'DatasetTypeID']).drop('DatasetTypeID', axis=1).set_index(['ExtSiteID', 'DateTime'])
            man_ts_data2 = man_ts_data1.Value.unstack(0)

            man_rec_summ1 = self.summ[self.summ.ExtSiteID.isin(buff_sites)].copy()
            man_rec_ts_data1 = mssql.rd_sql_ts(self.ts_server, param['input']['ts_database'], param['input']['ts_table'], ['ExtSiteID', 'DatasetTypeID'], 'DateTime', 'Value', from_date=self.from_date, to_date=self.to_date, where_in={'ExtSiteID': man_rec_summ1.ExtSiteID.tolist(), 'DatasetTypeID': man_rec_summ1.DatasetTypeID.unique().tolist()}).reset_index()
            man_rec_ts_data1 = pd.merge(man_rec_summ1[['ExtSiteID', 'DatasetTypeID']], man_rec_ts_data1, on=['ExtSiteID', 'DatasetTypeID']).drop('DatasetTypeID', axis=1).set_index(['ExtSiteID', 'DateTime'])
            man_rec_ts_data2 = man_rec_ts_data1.Value.unstack(0).interpolate('time', limit=10)

            ## Run through regressions
            reg_lst = []
            new_lst = []

            for key, lst in buff_sites_dict.items():
                man_rec_ts_data3 = man_rec_ts_data2.loc[:, lst].copy()
                man_rec_ts_data3[man_rec_ts_data3 <= 0] = np.nan

                man_ts_data3 = man_ts_data2.loc[:, [key]].copy()
                man_ts_data3[man_ts_data3 <= 0] = np.nan

                lm1 = LM(man_rec_ts_data3, man_ts_data3)
                res1 = lm1.predict(n_ind=1, x_transform='log', y_transform='log', min_obs=self.min_gaugings)
                if res1 is None:
                    continue
                res1_f = res1.summary_df['f value'].iloc[0]

                res2 = lm1.predict(n_ind=2, x_transform='log', y_transform='log', min_obs=self.min_gaugings)
                if res2 is not None:
                    res2_f = res2.summary_df['f value'].iloc[0]
                else:
                    res2_f = 0

                f = [res1_f, res2_f]

                val = f.index(max(f))

                if val == 0:
                    reg_lst.append(res1.summary_df)

                    s1 = res1.summary_df.iloc[0]

                    d1 = man_rec_ts_data3[s1['x sites']].copy()
                    d1[d1 <= 0] = 0.001

                    new_data1 = np.exp(np.log(d1) * float(s1['x slopes']) + float(s1['y intercept']))
                    new_data1.name = key
                    new_data1[new_data1 <= 0] = 0
                else:
                    reg_lst.append(res2.summary_df)

                    s1 = res2.summary_df.iloc[0]
                    x_sites = s1['x sites'].split(', ')
                    x_slopes = [float(s) for s in s1['x slopes'].split(', ')]
                    intercept = float(s1['y intercept'])

                    d1 = man_rec_ts_data3[x_sites[0]].copy()
                    d1[d1 <= 0] = 0.001
                    d2 = man_rec_ts_data3[x_sites[1]].copy()
                    d2[d2 <= 0] = 0.001

                    new_data1 = np.exp((np.log(d1) * float(x_slopes[0])) + (np.log(d2) * float(x_slopes[1])) + intercept)
                    new_data1.name = key
                    new_data1[new_data1 <= 0] = 0

                new_lst.append(new_data1)

            new_data2 = pd.concat(new_lst, axis=1)
            reg_df = pd.concat(reg_lst).reset_index()
        else:
            new_data2 = pd.DataFrame()
            reg_df = pd.DataFrame()

        flow = pd.concat([rec_ts_data2, new_data2], axis=1).round(3)

        ## Save if required
        if hasattr(self, 'output_path'):
            run_time = pd.Timestamp.today().strftime('%Y-%m-%dT%H%M')

            if not reg_df.empty:
                reg_flow_csv = param['output']['reg_flow_csv'].format(run_date=run_time)
                reg_df.to_csv(os.path.join(self.output_path, reg_flow_csv), index=False)

            flow_csv = param['output']['flow_csv'].format(run_date=run_time)
            flow.to_csv(os.path.join(self.output_path, flow_csv))

        setattr(self, 'flow', flow)
        setattr(self, 'reg_flow', reg_df)
        return flow
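
## A standalone, hedged sketch (synthetic data; statsmodels assumed) of the model-selection
## idea used above: fit one- and two-predictor log-log regressions and keep whichever has
## the higher F value. The original code does this through a custom LM class; this is only
## a rough stand-in, not that class's API.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(1)
x1 = pd.Series(rng.uniform(0.1, 5.0, 50), name='x1')
x2 = pd.Series(rng.uniform(0.1, 5.0, 50), name='x2')
y = 0.8 * x1**1.2 * np.exp(rng.normal(0, 0.1, 50))  # depends on x1 only

fit1 = sm.OLS(np.log(y), sm.add_constant(np.log(x1))).fit()
fit2 = sm.OLS(np.log(y), sm.add_constant(np.log(pd.concat([x1, x2], axis=1)))).fit()
best = fit1 if fit1.fvalue >= fit2.fvalue else fit2
print(best.params)  # intercept is log(a); back-transform predictions with np.exp as above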
Example #13
def test_sel_sites_poly():
    pts1 = vector.sel_sites_poly(sites_shp_path,
                                 rec_catch_shp_path,
                                 buffer_dis=10)

    assert (len(pts1) == 2) & isinstance(pts1, gpd.GeoDataFrame)
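
## A self-contained variant of the same check (not part of the original test): synthetic
## points and a synthetic polygon instead of the shapefiles above; shapely is assumed to be
## available alongside geopandas, and vector.sel_sites_poly is the function exercised above.
from shapely.geometry import Point, Polygon

pts_syn = gpd.GeoDataFrame({'site': [1, 2, 3]},
                           geometry=[Point(1, 1), Point(5, 5), Point(50, 50)], crs=2193)
poly_syn = gpd.GeoDataFrame(geometry=[Polygon([(0, 0), (0, 10), (10, 10), (10, 0)])], crs=2193)

pts_in = vector.sel_sites_poly(pts_syn, poly_syn)
assert len(pts_in) == 2  # only the first two points fall inside the square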
Example #14

    sites_rec_bool = flow_sites_gdf.FlowSite.isin(rec_summ1.ExtSiteID.unique())

    sites_rec1 = flow_sites_gdf[sites_rec_bool].copy()
    sites_man1 = flow_sites_gdf[~sites_rec_bool].copy()

    flow_rec_sites1 = mssql.rd_sql(server, database, site_table, ['ExtSiteID', 'NZTMX', 'NZTMY'], where_in={'ExtSiteID': rec_summ1.ExtSiteID.unique().tolist()})

    flow_rec_sites2 = vector.xy_to_gpd('ExtSiteID', 'NZTMX', 'NZTMY', flow_rec_sites1)

    ## Estimate flow where recorder doesn't exist

    sites_man2 = sites_man1.copy()
    sites_man2['geometry'] = sites_man1.buffer(buffer_dis)

    rec_sites2 = vector.sel_sites_poly(flow_rec_sites2, sites_man2)

    rec_ts_data1 = mssql.rd_sql_ts(server, database, ts_table, 'ExtSiteID', 'DateTime', 'Value', from_date=param['from_date'], to_date=param['to_date'], where_in={'ExtSiteID': rec_sites2.ExtSiteID.tolist(), 'DatasetTypeID': rec_summ1.DatasetTypeID.unique().tolist()})

    rec_ts_data2 = rec_ts_data1.Value.unstack(0).interpolate('time', limit=10).dropna(axis=1)

    rec_flow1 = rec_ts_data2.loc[:, rec_ts_data2.columns.isin(sites_rec1.FlowSite)].copy()

    man_ts_data1 = mssql.rd_sql_ts(server, database, ts_table, 'ExtSiteID', 'DateTime', 'Value', from_date=param['from_date'], to_date=param['to_date'], where_in={'ExtSiteID': sites_man1.FlowSite.tolist(), 'DatasetTypeID': man_datasets})

    man_ts_data2 = man_ts_data1.Value.unstack(0)

    reg_lst = []
    new_lst = []

    for col in man_ts_data2: