def rd_niwa_rcp(base_path, mtypes, poly,
                vcsn_sites_csv=r'\\fileservices02\ManagedShares\Data\VirtualClimate\GIS\niwa_vcsn_wgs84.csv',
                id_col='Network', x_col='deg_x', y_col='deg_y', output_fun=None,
                export_path='output'):
    """
    Function to read in the NIWA RCP netcdf files and output the data in a specified format.
    """
    mtype_name = {'precip': 'TotalPrecipCorr', 'T_max': 'MaxTempCorr', 'T_min': 'MinTempCorr',
                  'P_atmos': 'MSLP', 'PET': 'PE', 'RH_mean': 'RelHum', 'R_s': 'SurfRad',
                  'U_z': 'WindSpeed'}

    ### Import and reorganize data
    vcsn_sites = pd.read_csv(vcsn_sites_csv)[[id_col, x_col, y_col]]

    sites_gpd = vector.xy_to_gpd(id_col, x_col, y_col, vcsn_sites, 4326)
    poly1 = gpd.read_file(poly)

    sites_gpd2 = sites_gpd.to_crs(poly1.crs)
    mtypes1 = [mtype_name[i] for i in mtypes]

    ### Select sites
    sites_gpd3 = vector.sel_sites_poly(sites_gpd2, poly1)[id_col]
    site_loc1 = vcsn_sites[vcsn_sites[id_col].isin(sites_gpd3)]
    site_loc1.columns = ['id', 'x', 'y']

    ### Read and extract data from netcdf files
    for root, dirs, files in os.walk(base_path):
        files2 = [i for i in files if i.endswith('.nc')]
        files3 = [j for j in files2 if any(j.startswith(i) for i in mtypes1)]
        file_paths1 = [os.path.join(root, i) for i in files3]
        if len(file_paths1) > 0:
            ds = rd_niwa_rcp_dir(file_paths1, site_loc1, mtypes)
            if callable(output_fun):
                new_base_path = root.replace(base_path, export_path)
                base_file_name = file_paths1[0].split('VCSN_')[1]
                if not os.path.exists(new_base_path):
                    os.makedirs(new_base_path)
                output_fun(ds, new_base_path, base_file_name)
                print(base_file_name)
            else:
                raise ValueError('Must have an output function.')
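A minimal usage sketch for rd_niwa_rcp as defined above. The base path, polygon shapefile, export path, and csv-writing output function are hypothetical placeholders, and the assumption that rd_niwa_rcp_dir returns an xarray Dataset is mine, not from the source.

import os


def csv_output(ds, out_path, base_file_name):
    # Hypothetical output function: assumes ds (from rd_niwa_rcp_dir) is an xarray Dataset
    # and flattens it to a csv named after the source netcdf file.
    ds.to_dataframe().reset_index().to_csv(
        os.path.join(out_path, base_file_name.replace('.nc', '.csv')), index=False)


# Example call with assumed inputs: a folder of RCP netcdf files and a catchment polygon.
rd_niwa_rcp(base_path=r'C:\data\niwa_rcp', mtypes=['precip', 'PET'],
            poly=r'C:\data\catchment.shp', output_fun=csv_output,
            export_path=r'C:\data\niwa_rcp_out')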
site_xy1 = gpd.GeoDataFrame(site_xy['ExtSiteID'], geometry=geometry, crs=2193)

gauge_xy = site_xy1[site_xy1.ExtSiteID.isin([site])].set_index('ExtSiteID').copy()
rec_xy = site_xy1[site_xy1.ExtSiteID.isin(rec_sites)].set_index('ExtSiteID').copy()

###########################################
### Iterate through the low flow sites

results_dict = {}
for g in gauge_xy.index:
    print(g)

    ## Determine recorder sites within search distance
    near_rec_sites = sel_sites_poly(rec_xy, gauge_xy.loc[[g]], search_dis)
    print('There are ' + str(len(near_rec_sites)) + ' recorder sites within range')

    ## Extract all ts data
    g_data = mssql.rd_sql(server, database, ts_daily_table, ['ExtSiteID', 'DateTime', 'Value'],
                          where_in={'DatasetTypeID': man_datasets, 'ExtSiteID': [g],
                                    'QualityCode': qual_codes},
                          from_date=min_date, date_col='DateTime')
    g_data.DateTime = pd.to_datetime(g_data.DateTime)
rec_xy = site_xy1[site_xy1.ExtSiteID.isin(rec_sites.ExtSiteID)].set_index('ExtSiteID').copy()

###########################################
### Iterate through the low flow sites

results_dict = {}
for g in man_sites2.ExtSiteID.tolist():
    print(g)

    ## Max trig
#    max_trig = man_sites2[man_sites2.ExtSiteID == g].max_trig.tolist()[0]

    ## Determine recorder sites within search distance
    man_loc = man_xy.loc[[g.strip()]].buffer(search_dis)
    near_rec_sites = sel_sites_poly(rec_xy, man_loc)
    print('There are ' + str(len(near_rec_sites)) + ' recorder sites within range')

    if not near_rec_sites.empty:
        ## Extract all ts data
        g_data = mssql.rd_sql(server, database, ts_daily_table, ['ExtSiteID', 'DateTime', 'Value'],
                              where_col={'DatasetTypeID': man_datasets, 'ExtSiteID': [g],
                                         'QualityCode': qual_codes},
                              from_date=min_date,
rec_streams_shp_path = get_path(rec_streams_shp)
rec_catch_shp_path = get_path(rec_catch_shp)
catch_shp_path = get_path(catch_shp)

sites_col_name = 'SITENUMBER'
poly_col_name = 'Catchmen_1'
line_site_col = 'NZREACH'

#######################################
### Examples

pts = util.load_geo_data(sites_shp_path)
pts['geometry'] = pts.geometry.simplify(1)

## Selecting points from within a polygon
pts1 = vector.sel_sites_poly(sites_shp_path, rec_catch_shp_path, buffer_dis=10)

## Spatial join attributes of polygons to points
pts2, poly2 = vector.pts_poly_join(sites_shp_path, catch_shp_path, poly_col_name)

## Create a GeoDataFrame from x and y data
pts_df = pts[[sites_col_name, 'geometry']].copy()
pts_df['x'] = pts_df.geometry.x
pts_df['y'] = pts_df.geometry.y
pts_df.drop('geometry', axis=1, inplace=True)
pts3 = vector.xy_to_gpd(sites_col_name, 'x', 'y', pts_df)

## Find the closest line to points
line1 = vector.closest_line_to_pts(sites_shp_path, rec_streams_shp_path, line_site_col, buffer_dis=100)
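The vector helpers above accept shapefile paths, while other snippets in this section pass GeoDataFrames directly. A minimal self-contained sketch under that assumption, using made-up NZTM (EPSG:2193) coordinates and the same vector module as the examples above:

import geopandas as gpd
from shapely.geometry import Point, Polygon

# Assumed point and polygon data for illustration only.
pts_gdf = gpd.GeoDataFrame({'SITENUMBER': [1, 2, 3]},
                           geometry=[Point(1570000, 5180000), Point(1570500, 5180500),
                                     Point(1599000, 5199000)],
                           crs=2193)
poly_gdf = gpd.GeoDataFrame(geometry=[Polygon([(1569000, 5179000), (1572000, 5179000),
                                               (1572000, 5182000), (1569000, 5182000)])],
                            crs=2193)

# Select the points that fall within the (buffered) polygon.
pts_in_poly = vector.sel_sites_poly(pts_gdf, poly_gdf, buffer_dis=10)
print(pts_in_poly)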
###########################################
### Iterate through the low flow sites

if isinstance(input_sites, list):
    man_sites2 = man_sites1[man_sites1.ExtSiteID.isin(input_sites)]
else:
    man_sites2 = man_sites1

results_dict = {}
for g in man_sites2.ExtSiteID.tolist():
    print(g)

    man_loc = site_xy1.loc[[g.strip()], :]
    man_loc['geometry'] = man_loc.buffer(search_dis)
    near_rec_sites = sel_sites_poly(site_xy1, man_loc)
    near_rec_sites = near_rec_sites.loc[near_rec_sites.index != g]
    print('There are ' + str(len(near_rec_sites)) + ' sites within range')

    if not near_rec_sites.empty:
        ## Extract all ts data
        g_data = mssql.rd_sql(server, database, ts_daily_table, ['ExtSiteID', 'DateTime', 'Value'],
                              where_in={'DatasetTypeID': man_datasets, 'ExtSiteID': [g],
                                        'QualityCode': qual_codes},
                              from_date=min_date,
def getCorrelations(self):
    '''
    Calculate correlations (fits) for flow sites that do not have a recorder.
    Returns:
        - dataframe with the 6 best correlations for each flow site
        - dataframe with flow time-series for all sites (recorder and manual gaugings)
        - geopandas dataframe with locations of recorder sites within a buffer distance from the manual sites
    Writes:
        - csv-file with the 6 best correlations for each flow site
        - csv-file with flow time-series for all sites (recorder and manual gaugings)
        - shapefile with locations of recorder sites within a buffer distance from the manual sites
    '''

    dataset_type_table = 'DatasetType'
    ts_summ_table = 'TSDataNumericDailySumm'
    ts_table = 'TSDataNumericDaily'
    site_table = 'ExternalSite'

    self.min_flow_obs = self.config.getint('FLOW_CORRELATIONS', 'min_flow_obs')
    self.buf_dist = self.config.getint('FLOW_CORRELATIONS', 'buf_dist')
    self.filter_winter_flow = self.config.getint('FLOW_CORRELATIONS', 'filter_winter_flow')

    #-lists of stations and corresponding dates for which flow records should be removed before correlation is calculated
    remove_stat_dates = self.config.get('FLOW_CORRELATIONS', 'remove_stat_dates').split(',')
    if len(remove_stat_dates) > 1:
        remove_stats = remove_stat_dates[0:-1:2]
        remove_dates = remove_stat_dates[1::2]
    else:
        remove_stats = False
        remove_dates = False
        remove_stat_dates = None

    #-Get lowflow sites
    flow_sites = self.flow_sites_gdf.flow_site.tolist()

    ## Read in datasettypes
    datasets = mssql.rd_sql(self.server, self.database, dataset_type_table, ['DatasetTypeID', 'CTypeID'],
                            where_in={'FeatureID': [1], 'MTypeID': [2], 'CTypeID': [1, 2], 'DataCodeID': [1]})

    #-Get datasetTypes for recorded data and manual data
    rec_datasetTypes = datasets[datasets.CTypeID == 1].DatasetTypeID.tolist()
    man_datasetTypes = datasets[datasets.CTypeID == 2].DatasetTypeID.tolist()
    all_datasetTypes = rec_datasetTypes.copy()
    all_datasetTypes.extend(man_datasetTypes)

    #-Get summary table for the lowflow sites
    site_summ = mssql.rd_sql(self.server, self.database, ts_summ_table,
                             where_in={'DatasetTypeID': all_datasetTypes, 'ExtSiteID': flow_sites})
    site_summ.FromDate = pd.to_datetime(site_summ.FromDate)
    site_summ.ToDate = pd.to_datetime(site_summ.ToDate)
    site_summ.drop('ModDate', axis=1, inplace=True)

    #-Throw out sites from summary table and lowflow sites that have fewer records than 'min_flow_obs'
    too_short = site_summ.loc[site_summ.Count < self.min_flow_obs, ['ExtSiteID', 'Count']]
    if len(too_short) > 0:
        for j in too_short.iterrows():
            print('ExtSiteID %s is not used because it only has %s records, which is less than %s.'
                  % (j[1]['ExtSiteID'], int(j[1]['Count']), self.min_flow_obs))
            site_summ = site_summ.loc[site_summ['ExtSiteID'] != j[1]['ExtSiteID']]
            flow_sites.remove(j[1]['ExtSiteID'])

    #-Get site ids for recorder and manual sites of the lowflow sites and create geodataframe for manually recorded lowflow sites
    rec_sites = site_summ.loc[site_summ.DatasetTypeID.isin(rec_datasetTypes), 'ExtSiteID'].tolist()
    man_sites = site_summ.loc[site_summ.DatasetTypeID.isin(man_datasetTypes), 'ExtSiteID'].tolist()
    man_sites_gdf = self.flow_sites_gdf[self.flow_sites_gdf.flow_site.isin(man_sites)].copy()

    #-get all recorder flow sites within a buffer distance from the manually recorded lowflow sites
    man_sites_buffer_gdf = man_sites_gdf.copy()
    man_sites_buffer_gdf['geometry'] = man_sites_gdf.buffer(self.buf_dist)
    all_recorder_flowsites = mssql.rd_sql(self.server, self.database, ts_summ_table, ['ExtSiteID'],
                                          where_in={'DatasetTypeID': rec_datasetTypes})
    all_recorder_flowsites = pd.unique(all_recorder_flowsites.ExtSiteID).tolist()
    all_recorder_flowsites = mssql.rd_sql(self.server, self.database, site_table, ['ExtSiteID', 'NZTMX', 'NZTMY'],
                                          where_in={'ExtSiteID': all_recorder_flowsites})
    all_recorder_flowsites_gdf = vector.xy_to_gpd('ExtSiteID', 'NZTMX', 'NZTMY', all_recorder_flowsites)
    self.rec_sites_buffer_gdf = vector.sel_sites_poly(all_recorder_flowsites_gdf, man_sites_buffer_gdf)

    #-write recorder sites within buffer to a shapefile
    self.rec_sites_buffer_gdf.to_file(os.path.join(self.results_path, self.config.get('FLOW_CORRELATIONS', 'rec_sites_shp')))
    all_recorder_flowsites = None; all_recorder_flowsites_gdf = None
    del all_recorder_flowsites, all_recorder_flowsites_gdf

    #-merge list of lowflow sites with list of recorder sites in buffer to create list of all sites for which to extract data
    all_flow_sites = flow_sites.copy()
    all_flow_sites.extend(pd.unique(self.rec_sites_buffer_gdf.ExtSiteID).tolist())
    all_flow_sites = pd.DataFrame(columns=all_flow_sites)
    all_flow_sites = pd.unique(all_flow_sites.columns).tolist()

    #-Get time-series of all_flow_sites
    ts_df = mssql.rd_sql(self.server, self.database, ts_table,
                         ['ExtSiteID', 'DatasetTypeID', 'DateTime', 'Value', 'QualityCode'],
                         where_in={'DatasetTypeID': all_datasetTypes, 'ExtSiteID': all_flow_sites})
    ts_df = ts_df.loc[ts_df.QualityCode > 100]  #-keep everything other than missing data
    ts_df.DateTime = pd.to_datetime(ts_df.DateTime)
    ts_df_sites = pd.unique(ts_df.ExtSiteID).tolist()
    all_flow_sites = None; del all_flow_sites

    #-Check if there is data for all lowflow sites
    for j in flow_sites:
        if j not in ts_df_sites:
            print('There is zero flow data for site %s, or record of observations for this site was too short.' % j)

    #-Minimum and maximum date of all data
    min_date = ts_df.DateTime.min()
    max_date = ts_df.DateTime.max()

    #-Fill dataframe with data for min_date through max_date
    df_final = pd.DataFrame(index=pd.date_range(min_date, max_date, freq='D'), columns=ts_df_sites)
    df_final.rename_axis('DateTime', inplace=True)
    for j in ts_df_sites:
        df_short = ts_df.loc[ts_df.ExtSiteID == j, ['DateTime', 'Value', 'QualityCode']]
        #-keep only the records with the highest qualitycode
        df_short_group = df_short.groupby(['DateTime', 'QualityCode']).mean()
        df_short_group.reset_index(inplace=True)
        df_short_group.sort_values(by=['DateTime', 'QualityCode'], inplace=True)
        df_short_group.drop_duplicates(subset='DateTime', keep='last', inplace=True)
        df_short_group.drop('QualityCode', inplace=True, axis=1)
        df_short_group.set_index('DateTime', inplace=True)
        df_short_group.rename(columns={'Value': j}, inplace=True)
        df_final[[j]] = df_short_group
    ts_df = None; df_short_group = None
    del ts_df, df_short_group
    df_final.dropna(how='all', inplace=True)

    #-create a copy of all gauged and recorded flow (before filtering) to make it accessible in the main class
    self.flow_ts_df = df_final.copy()
    self.flow_ts_df.to_csv(os.path.join(self.results_path, self.config.get('FLOW_CORRELATIONS', 'flow_ts_csv')))

    #-remove zero records
    df_final[df_final == 0] = np.nan

    #-remove unreliable values using lists of stations and dates
    if remove_stats and remove_dates:
        df_final.reset_index(inplace=True)
        i = 0
        for s in remove_stats:
            stat_date = remove_dates[i]
            df_final.loc[(df_final['DateTime'] == stat_date), s] = np.nan
            i += 1
        df_final.set_index('DateTime', inplace=True)

    #-keep only winter flows for estimating correlations
    if self.filter_winter_flow:
        df_final.reset_index(inplace=True)
        df_final['Month'] = df_final['DateTime'].dt.strftime('%m').astype(int)
        df_final.set_index('DateTime', inplace=True)
        df_final = df_final.loc[(df_final.Month > 4) & (df_final.Month < 10)]
    df_final.dropna(how='all', inplace=True)

    #-loop over the lowflow sites and calculate regressions
    df_regression = pd.DataFrame(columns=['y', 'x', 'mean_y', 'mean_x', 'nobs', 'rmse', 'Adj. R2-squared',
                                          'p-value', 'slope', 'intercept', 'power', 'fittype'])
    i = 0
    for j in flow_sites:
        for s in self.rec_sites_buffer_gdf.ExtSiteID.tolist():
            sel_df = df_final[[j, s]]
            sel_df.replace(0., np.nan, inplace=True)
            sel_df.dropna(inplace=True)
            #-only fit for sites that have the minimum number of observations
            if len(sel_df) >= self.min_flow_obs:
                try:
                    #-linear regression
                    #-set x and y
                    x = sel_df[[s]]
                    x = sm.add_constant(x)  #-needed for intercept
                    y = sel_df[[j]]
                    #-linear fit
                    model = sm.OLS(y, x).fit()
                    #-predict using the model
                    predictions = model.predict(x)  # make the predictions by the model
                    rmse_val = np.sqrt(np.mean((predictions.to_numpy() - y.to_numpy())**2))
                    #-fill dataframe with stats for linear fit
                    df_regression.loc[i, 'y'] = j
                    df_regression.loc[i, 'x'] = s
                    df_regression.loc[i, 'mean_y'] = np.nanmean(y)
                    df_regression.loc[i, 'mean_x'] = np.nanmean(x[s].to_numpy())
                    df_regression.loc[i, 'nobs'] = model.nobs
                    df_regression.loc[i, 'rmse'] = rmse_val
                    df_regression.loc[i, 'Adj. R2-squared'] = model.rsquared_adj
                    df_regression.loc[i, 'p-value'] = model.f_pvalue
                    df_regression.loc[i, 'slope'] = model.params[1]
                    df_regression.loc[i, 'intercept'] = model.params[0]
                    df_regression.loc[i, 'fittype'] = 'linear'
                    i += 1
                except:
                    print('Could not establish linear fit for %s and %s...' % (j, s))
                try:
                    #-power fit regression
                    #-set x and y
                    x = sel_df[[s]]
                    x = sm.add_constant(x)  #-needed for intercept
                    y = sel_df[[j]]
                    #-***********************************************************
                    #-   y = ax^b
                    #-   log(y) = log(a) + b*log(x)
                    #-***********************************************************
                    logY = np.log(y)
                    logX = np.log(x[[s]])
                    logX = sm.add_constant(logX)
                    model = sm.OLS(logY, logX).fit()
                    x = x[s].to_numpy()
                    #-predictions of the exact x values for calculating rmse
                    predictions = np.exp(model.params[0]) * x**model.params[1]
                    rmse_val = np.sqrt(np.mean((predictions - y.to_numpy())**2))
                    #-fill dataframe with stats for power fit
                    df_regression.loc[i, 'y'] = j
                    df_regression.loc[i, 'x'] = s
                    df_regression.loc[i, 'mean_y'] = np.nanmean(y)
                    df_regression.loc[i, 'mean_x'] = np.nanmean(x)
                    df_regression.loc[i, 'nobs'] = model.nobs
                    df_regression.loc[i, 'rmse'] = rmse_val
                    df_regression.loc[i, 'Adj. R2-squared'] = model.rsquared_adj
                    df_regression.loc[i, 'p-value'] = model.f_pvalue
                    df_regression.loc[i, 'slope'] = np.exp(model.params[0])
                    df_regression.loc[i, 'power'] = model.params[1]
                    df_regression.loc[i, 'fittype'] = 'power'
                    i += 1
                except:
                    print('Could not establish power fit for %s and %s...' % (j, s))
            sel_df = None; del sel_df

    #-remove negative correlations
    df_regression.loc[df_regression['Adj. R2-squared'] < 0., :] = np.nan
    df_regression.dropna(how='all', inplace=True)

    #-loop over lowflow sites, and select best six fits by sorting on R2, rmse, nobs
    self.best_regressions_df = pd.DataFrame(columns=df_regression.columns)
    for j in flow_sites:
        sel_df = df_regression.loc[df_regression.y == j]
        sel_df.sort_values(by=['Adj. R2-squared', 'rmse', 'nobs'], ascending=[False, True, False], inplace=True)
        sel_df = sel_df.iloc[0:6, :]
        self.best_regressions_df = pd.concat([self.best_regressions_df, sel_df])
        sel_df = None; del sel_df
    self.best_regressions_df.to_csv(os.path.join(self.results_path, self.config.get('FLOW_CORRELATIONS', 'correlations_csv')), index=False)

    #-make 6 plots for each site to visually pick the best
    unique_y = pd.unique(self.best_regressions_df['y'])
    unique_y = list(unique_y)
    for j in flow_sites:
        if j in unique_y:
            #-get regression results for site j
            sel_df_regression = self.best_regressions_df.loc[self.best_regressions_df.y == j]
            #-plot the best 6 fits
            fig = plt.figure(figsize=(11, 8))
            kk = 1
            for k in sel_df_regression.iterrows():
                try:
                    ax1 = plt.subplot(3, 2, kk)
                    xSite = k[1]['x']
                    ySite = k[1]['y']
                    xy = df_final[[xSite, ySite]]
                    xy.dropna(inplace=True)
                    x = xy[xSite].to_numpy()
                    y = xy[ySite].to_numpy()
                    xmin = np.nanmin(x)
                    xmax = np.nanmax(x)
                    dx = (xmax - xmin) / 100
                    xvals = np.arange(xmin, xmax, dx)
                    ymin = np.nanmin(y)
                    ymax = np.nanmax(y)
                    #-Get the stats from the table
                    fit_type = k[1]['fittype']
                    if fit_type == 'linear':
                        eq_str = 'y = %.3fx + %.3f' % (k[1]['slope'], k[1]['intercept'])
                        predictions = k[1]['slope'] * xvals + k[1]['intercept']
                    else:
                        eq_str = 'y = %.3fx$^{%.3f}$' % (k[1]['slope'], k[1]['power'])
                        predictions = k[1]['slope'] * (xvals ** k[1]['power'])
                    r2 = k[1]['Adj. R2-squared']
                    rmse_val = k[1]['rmse']
                    p_val = k[1]['p-value']
                    nobs = k[1]['nobs']
                    plt.plot(xvals, predictions, label='fit')
                    plt.scatter(x, y, color='red', label='data')
                    ax1.grid(True)
                    ax1.set_xlabel('Flow at %s [m$^3$ s$^{-1}$]' % xSite)
                    ax1.set_ylabel('Flow at %s [m$^3$ s$^{-1}$]' % ySite)
                    ax1.legend(loc='lower right')
                    dy = (ymax - ymin) / 10
                    dx = (xmax - xmin) / 10
                    #-plot the stats in the plot
                    plt.text(xmin, ymax, eq_str)
                    plt.text(xmin, ymax - dy, 'R$^2$ %.2f' % r2)
                    plt.text(xmin, ymax - 2 * dy, 'RMSE %.2f' % rmse_val)
                    plt.text(xmin, ymax - 3 * dy, 'p-value %.2f' % p_val)
                    plt.text(xmin, ymax - 4 * dy, 'nobs %.0f' % nobs)
                    ax1.set_xlim([xmin - dx, xmax + dx])
                    ax1.set_ylim([ymin - dy, ymax + dy])
                    kk += 1
                except:
                    pass
            fig.tight_layout()
            if self.filter_winter_flow:
                plt.savefig(os.path.join(self.results_path, '%s_correlations_winteronly.png' % ySite), dpi=300)
            else:
                plt.savefig(os.path.join(self.results_path, '%s_correlations.png' % ySite), dpi=300)
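The power-fit branch of getCorrelations can be illustrated in isolation: fit log(y) = log(a) + b*log(x) with OLS, then report a = exp(intercept) and b = slope. A minimal standalone sketch with synthetic data (the series name 'rec_site' and all values are invented), assuming the same numpy, pandas, and statsmodels imports as the original module.

import numpy as np
import pandas as pd
import statsmodels.api as sm

# Synthetic flows roughly following y = a * x**b with multiplicative noise.
rng = np.random.default_rng(1)
x = pd.Series(np.linspace(0.5, 5.0, 50), name='rec_site')
y = 0.8 * x**1.2 * np.exp(rng.normal(0, 0.05, len(x)))

# Log-log OLS fit, mirroring the power-fit block above.
logX = sm.add_constant(np.log(x))
model = sm.OLS(np.log(y), logX).fit()

a = np.exp(model.params.iloc[0])   # back-transformed intercept -> 'slope' column (coefficient a)
b = model.params.iloc[1]           # log-space slope -> 'power' column (exponent b)
predictions = a * x.to_numpy()**b
rmse_val = np.sqrt(np.mean((predictions - y.to_numpy())**2))
print(a, b, model.rsquared_adj, rmse_val)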
sites_buffer_gdf = flow_sites_gdf.copy()
sites_buffer_gdf['geometry'] = sites_buffer_gdf.buffer(60000)

all_gwl_sites = mssql.rd_sql(server, database, ts_summ_table, ['ExtSiteID'],
                             where_in={'DatasetTypeID': datasets.DatasetTypeID.tolist()})
all_gwl_sites = pd.unique(all_gwl_sites.ExtSiteID).tolist()
all_gwl_sites = mssql.rd_sql(server, database, site_table, ['ExtSiteID', 'NZTMX', 'NZTMY'],
                             where_in={'ExtSiteID': all_gwl_sites})
all_gwl_sites_gdf = vector.xy_to_gpd('ExtSiteID', 'NZTMX', 'NZTMY', all_gwl_sites)
sel_gwl_sites_gdf = vector.sel_sites_poly(all_gwl_sites_gdf, sites_buffer_gdf)
all_gwl_sites = None
all_gwl_sites_gdf = None

#-Get summary table for the gwl sites and filter period of interest
site_summ = mssql.rd_sql(server, database, ts_summ_table,
                         where_in={'DatasetTypeID': datasets.DatasetTypeID.tolist(),
                                   'ExtSiteID': sel_gwl_sites_gdf.ExtSiteID.tolist()})
site_summ.FromDate = pd.to_datetime(site_summ.FromDate)
site_summ.ToDate = pd.to_datetime(site_summ.ToDate)
site_summ.drop('ModDate', axis=1, inplace=True)
site_summ = site_summ.loc[(site_summ.FromDate < to_date)
def rd_niwa_vcsn(mtypes, sites,
                 nc_path=r'\\fileservices02\ManagedShares\Data\VirtualClimate\vcsn_precip_et_2016-06-06.nc',
                 vcsn_sites_csv=r'\\fileservices02\ManagedShares\Data\VirtualClimate\GIS\niwa_vcsn_wgs84.csv',
                 id_col='Network', x_col='deg_x', y_col='deg_y', buffer_dis=0, include_sites=False,
                 from_date=None, to_date=None, out_crs=None, netcdf_out=None):
    """
    Function to read in the NIWA vcsn netcdf file and output the data as a dataframe.

    mtypes -- A string or list of the measurement types (either 'precip', or 'PET').\n
    sites -- Either a list of vcsn site names or a polygon of the area of interest.\n
    nc_path -- The path to the vcsn nc file.\n
    vcsn_sites_csv -- The csv file that relates the site name to coordinates.\n
    id_col -- The site name column in vcsn_sites_csv.\n
    x_col -- The x column name in vcsn_sites_csv.\n
    y_col -- The y column name in vcsn_sites_csv.\n
    buffer_dis -- A buffer distance around the polygon for selecting sites.\n
    include_sites -- Should the site names be added to the output?\n
    from_date -- The start date of the output data (e.g. '2000-01-01'), or None for all.\n
    to_date -- The end date of the output data, or None for all.\n
    out_crs -- The crs epsg number for the output coordinates if different than the default WGS84 (e.g. 2193 for NZTM).\n
    netcdf_out -- Optional path to also save the selected data as a netcdf file.
    """
    mtype_name = {'precip': 'rain', 'PET': 'pe'}

    ### Import and reorganize data
    vcsn_sites = pd.read_csv(vcsn_sites_csv)[[id_col, x_col, y_col]]

    if isinstance(sites, str):
        if sites.endswith('.shp'):
            sites_gpd = vector.xy_to_gpd(id_col, x_col, y_col, vcsn_sites, 4326)
            poly1 = gpd.read_file(sites)

            sites_gpd2 = sites_gpd.to_crs(poly1.crs)

            ### Select sites
            sites2 = vector.sel_sites_poly(sites_gpd2, poly1, buffer_dis)[id_col]
    elif isinstance(sites, (list, pd.Series, np.ndarray)):
        sites2 = sites

    ### Select locations
    site_loc1 = vcsn_sites[vcsn_sites[id_col].isin(sites2)]
    site_loc1.columns = ['id', 'x', 'y']

    ### Select mtypes
    if isinstance(mtypes, str):
        mtypes1 = [mtype_name[mtypes]]
    else:
        mtypes1 = [mtype_name[i] for i in mtypes]

    if include_sites:
        mtypes1.extend(['site'])

    ### Read and extract data from netcdf files
    ds1 = xr.open_dataset(nc_path)
    time1 = pd.to_datetime(ds1.time.values)
    if isinstance(from_date, str):
        time1 = time1[time1 >= from_date]
    if isinstance(to_date, str):
        time1 = time1[time1 <= to_date]
    lat1 = ds1.latitude.values
    lon1 = ds1.longitude.values
    lat2 = lat1[np.in1d(lat1, site_loc1.y.unique())]
    lon2 = lon1[np.in1d(lon1, site_loc1.x.unique())]
    ds2 = ds1.loc[{'longitude': lon2, 'time': time1.values, 'latitude': lat2}]
    ds3 = ds2[mtypes1]

    ### Convert to DataFrame
    df1 = ds3.to_dataframe().reset_index()
    df1.rename(columns={'latitude': 'y', 'longitude': 'x'}, inplace=True)
    df1 = df1.dropna()

    ### Convert to different crs if needed
    if out_crs is not None:
        crs1 = util.convert_crs(out_crs)
        new_gpd1 = vector.xy_to_gpd('id', 'x', 'y', site_loc1, 4326)
        new_gpd2 = new_gpd1.to_crs(crs1)
        site_loc2 = site_loc1.copy()
        site_loc2['x_new'] = new_gpd2.geometry.apply(lambda j: j.x)
        site_loc2['y_new'] = new_gpd2.geometry.apply(lambda j: j.y)

        df2 = pd.merge(df1, site_loc2[['x', 'y', 'x_new', 'y_new']], on=['x', 'y'])
        df3 = df2.drop(['x', 'y'], axis=1).rename(columns={'x_new': 'x', 'y_new': 'y'})
        col_order = ['y', 'x', 'time']
        col_order.extend(mtypes1)
        df4 = df3[col_order]
    else:
        df4 = df1

    ### Optionally export to netcdf before closing the source datasets
    if isinstance(netcdf_out, str):
        ds3.to_netcdf(netcdf_out)

    ds1.close()
    ds3.close()

    ### Return
    return df4
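A minimal usage sketch for rd_niwa_vcsn as defined above; the catchment shapefile path and date range are hypothetical placeholders, and the UNC defaults for nc_path and vcsn_sites_csv are left as-is.

# Example call with assumed inputs: daily precipitation for a catchment polygon,
# reprojected to NZTM (EPSG:2193).
precip_df = rd_niwa_vcsn(mtypes='precip', sites=r'C:\data\catchment.shp', buffer_dis=1000,
                         from_date='2010-01-01', to_date='2015-12-31', out_crs=2193)
print(precip_df.head())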
usage_output = 'pareora_huts_usage_mon.csv'

############################################
### Extract data

site_filter = {'CatchmentGroupName': catch_group}

a1 = AlloUsage(from_date, to_date, site_filter=site_filter)

sites0 = a1.sites.reset_index().copy()

sites = vector.xy_to_gpd('wap', 'NZTMX', 'NZTMY', sites0)

catch_del = gpd.read_file(catch_del_shp_path)
catch_del.rename(columns={'SITENUMBER': 'site'}, inplace=True)
catch_del1 = catch_del[catch_del.site == rec_site]

sites1 = vector.sel_sites_poly(sites, catch_del1)

## Usage data
usage1 = pd.read_hdf(os.path.join(source_path, usage_hdf))

usage2 = usage1[usage1.wap.isin(sites1.wap)].copy()
usage2['time'] = pd.to_datetime(usage2['time'])

usage2.to_csv(os.path.join(base_path, shp_dir, usage_output), index=False)

usage3 = usage2.groupby('time')['sd_usage'].sum()
rec_rivers_dict = sql1.get_dict(rec_rivers_sql)
rec_catch_dict = sql1.get_dict(rec_catch_sql)

###################################
### Catchment delineation and WAPs

catch1 = rec.catch_delineate(sites2, rec_rivers_dict, rec_catch_dict)
catch1.to_file(os.path.join(results_path, catch_del_shp))

wap1 = mssql.rd_sql(server, database, crc_wap_table, ['wap']).wap.unique()

sites3 = sites1[sites1.ExtSiteID.isin(wap1)].copy()

sites4 = vector.xy_to_gpd('ExtSiteID', 'NZTMX', 'NZTMY', sites3)

sites5 = vector.sel_sites_poly(sites4, catch1)
sites5.to_file(os.path.join(results_path, wap_shp))

##################################
### Get crc data

allo1 = AlloUsage(crc_wap_filter={'wap': sites5.ExtSiteID.tolist()}, from_date=from_date, to_date=to_date)

#allo1.allo[allo1.allo.crc_status == 'Terminated - Replaced']

allo1.allo_wap.to_csv(os.path.join(results_path, allo_wap_csv))
allo1.allo.to_csv(os.path.join(results_path, allo_csv))
    'Catchment', 'DateTime of forecast', 'MetService product', 'Catchment precipitation [mm]', 'Month'
])
df_catchment_avg_final = pd.concat([
    df_catchment_avg_final,
    pd.DataFrame(columns=[i for i in range(1, 85 + 1)])
], axis=1)

###-now per catchment
for catch in unique_catchments:
    print('Processing %s' % catch)

    #-Select only the geopandas dataframe of the catchment of interest
    sel_catch = catchment_gdf.loc[catchment_gdf['CATCH_NAME'] == catch]
    #-Select the stations that can be found within this catchment
    sel_stats = vector.sel_sites_poly(stations_gdf, sel_catch)
    #-Select only the precipitation data for the stations part of that catchment
    df_catchment = df.loc[df.ExtSiteID.isin(sel_stats.ExtSiteID.tolist())]
    df_catchment.insert(0, 'Catchment', catch)

    #-Create a copy of the catchment df that contains the stations to be used for averaging
    df_catchment_avg = df_catchment.copy()
    df_catchment_avg.drop('ExtSiteID', axis=1, inplace=True)
    #-Average precipitation station and forecast values
    df_catchment_avg = df_catchment_avg.groupby('DateTime of forecast').mean()
    df_catchment_avg.reset_index(inplace=True)
    df_catchment_avg.insert(1, 'MetService product', fprod)
    df_catchment_avg.insert(0, 'Catchment', catch)

    #-Add columns for calculating percentage difference for individual stations
    df_catchment = pd.concat([df_catchment, df_proc], axis=1)
def flow_est(self, buffer_dis=50000):
    """
    Function to query and/or estimate flow at the input_sites.

    Parameters
    ----------
    buffer_dis : int
        The search radius for the regressions in meters.

    Returns
    -------
    DataFrame of Flow
    """

    ### Read data if it exists
    if self.input_summ.CollectionType.isin(['Recorder']).any():
        rec_summ1 = self.input_summ[self.input_summ.CollectionType.isin(['Recorder'])].copy()
        rec_ts_data1 = mssql.rd_sql_ts(self.ts_server, param['input']['ts_database'], param['input']['ts_table'],
                                       ['ExtSiteID', 'DatasetTypeID'], 'DateTime', 'Value',
                                       from_date=self.from_date, to_date=self.to_date,
                                       where_in={'ExtSiteID': rec_summ1.ExtSiteID.tolist(),
                                                 'DatasetTypeID': rec_summ1.DatasetTypeID.unique().tolist()}).reset_index()
        rec_ts_data1 = pd.merge(rec_summ1[['ExtSiteID', 'DatasetTypeID']], rec_ts_data1,
                                on=['ExtSiteID', 'DatasetTypeID']).drop('DatasetTypeID', axis=1).set_index(['ExtSiteID', 'DateTime'])
        rec_ts_data2 = rec_ts_data1.Value.unstack(0)
    else:
        rec_ts_data2 = pd.DataFrame()

    ### Run correlations if necessary
    if self.input_summ.CollectionType.isin(['Manual Field']).any():
        man_summ1 = self.input_summ[self.input_summ.CollectionType.isin(['Manual Field'])].copy()
        man_sites1 = self.sites_gdf[self.sites_gdf.ExtSiteID.isin(man_summ1.ExtSiteID)].copy()

        ## Determine which sites are within the buffer of the manual sites
        buff_sites_dict = {}
        man_buff1 = man_sites1.set_index(['ExtSiteID']).copy()
        man_buff1['geometry'] = man_buff1.buffer(buffer_dis)

        rec_sites_gdf = self.sites_gdf[self.sites_gdf.CollectionType == 'Recorder'].copy()

        for index in man_buff1.index:
            buff_sites1 = vector.sel_sites_poly(rec_sites_gdf, man_buff1.loc[[index]])
            buff_sites_dict[index] = buff_sites1.ExtSiteID.tolist()

        buff_sites_list = [item for sublist in buff_sites_dict.values() for item in sublist]
        buff_sites = set(buff_sites_list)

        ## Pull out recorder data needed for all manual sites
        man_ts_data1 = mssql.rd_sql_ts(self.ts_server, param['input']['ts_database'], param['input']['ts_table'],
                                       ['ExtSiteID', 'DatasetTypeID'], 'DateTime', 'Value',
                                       from_date=self.from_date, to_date=self.to_date,
                                       where_in={'ExtSiteID': man_summ1.ExtSiteID.tolist(),
                                                 'DatasetTypeID': man_summ1.DatasetTypeID.unique().tolist()}).reset_index()
        man_ts_data1 = pd.merge(man_summ1[['ExtSiteID', 'DatasetTypeID']], man_ts_data1,
                                on=['ExtSiteID', 'DatasetTypeID']).drop('DatasetTypeID', axis=1).set_index(['ExtSiteID', 'DateTime'])
        man_ts_data2 = man_ts_data1.Value.unstack(0)

        man_rec_summ1 = self.summ[self.summ.ExtSiteID.isin(buff_sites)].copy()
        man_rec_ts_data1 = mssql.rd_sql_ts(self.ts_server, param['input']['ts_database'], param['input']['ts_table'],
                                           ['ExtSiteID', 'DatasetTypeID'], 'DateTime', 'Value',
                                           from_date=self.from_date, to_date=self.to_date,
                                           where_in={'ExtSiteID': man_rec_summ1.ExtSiteID.tolist(),
                                                     'DatasetTypeID': man_rec_summ1.DatasetTypeID.unique().tolist()}).reset_index()
        man_rec_ts_data1 = pd.merge(man_rec_summ1[['ExtSiteID', 'DatasetTypeID']], man_rec_ts_data1,
                                    on=['ExtSiteID', 'DatasetTypeID']).drop('DatasetTypeID', axis=1).set_index(['ExtSiteID', 'DateTime'])
        man_rec_ts_data2 = man_rec_ts_data1.Value.unstack(0).interpolate('time', limit=10)

        ## Run through regressions
        reg_lst = []
        new_lst = []

        for key, lst in buff_sites_dict.items():
            man_rec_ts_data3 = man_rec_ts_data2.loc[:, lst].copy()
            man_rec_ts_data3[man_rec_ts_data3 <= 0] = np.nan

            man_ts_data3 = man_ts_data2.loc[:, [key]].copy()
            man_ts_data3[man_ts_data3 <= 0] = np.nan

            lm1 = LM(man_rec_ts_data3, man_ts_data3)
            res1 = lm1.predict(n_ind=1, x_transform='log', y_transform='log', min_obs=self.min_gaugings)
            if res1 is None:
                continue
            res1_f = res1.summary_df['f value'].iloc[0]

            res2 = lm1.predict(n_ind=2, x_transform='log', y_transform='log', min_obs=self.min_gaugings)
            if res2 is not None:
                res2_f = res2.summary_df['f value'].iloc[0]
            else:
                res2_f = 0

            f = [res1_f, res2_f]
            val = f.index(max(f))

            if val == 0:
                reg_lst.append(res1.summary_df)

                s1 = res1.summary_df.iloc[0]

                d1 = man_rec_ts_data3[s1['x sites']].copy()
                d1[d1 <= 0] = 0.001

                new_data1 = np.exp(np.log(d1) * float(s1['x slopes']) + float(s1['y intercept']))
                new_data1.name = key
                new_data1[new_data1 <= 0] = 0
            else:
                reg_lst.append(res2.summary_df)

                s1 = res2.summary_df.iloc[0]
                x_sites = s1['x sites'].split(', ')
                x_slopes = [float(s) for s in s1['x slopes'].split(', ')]
                intercept = float(s1['y intercept'])

                d1 = man_rec_ts_data3[x_sites[0]].copy()
                d1[d1 <= 0] = 0.001
                d2 = man_rec_ts_data3[x_sites[1]].copy()
                d2[d2 <= 0] = 0.001

                new_data1 = np.exp((np.log(d1) * float(x_slopes[0])) + (np.log(d2) * float(x_slopes[1])) + intercept)
                new_data1.name = key
                new_data1[new_data1 <= 0] = 0

            new_lst.append(new_data1)

        new_data2 = pd.concat(new_lst, axis=1)
        reg_df = pd.concat(reg_lst).reset_index()
    else:
        new_data2 = pd.DataFrame()
        reg_df = pd.DataFrame()

    flow = pd.concat([rec_ts_data2, new_data2], axis=1).round(3)

    ## Save if required
    if hasattr(self, 'output_path'):
        run_time = pd.Timestamp.today().strftime('%Y-%m-%dT%H%M')

        if not reg_df.empty:
            reg_flow_csv = param['output']['reg_flow_csv'].format(run_date=run_time)
            reg_df.to_csv(os.path.join(self.output_path, reg_flow_csv), index=False)

        flow_csv = param['output']['flow_csv'].format(run_date=run_time)
        flow.to_csv(os.path.join(self.output_path, flow_csv))

    setattr(self, 'flow', flow)
    setattr(self, 'reg_flow', reg_df)

    return flow
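A small standalone sketch (with invented numbers) of the single-predictor back-transform used in flow_est above: given a log-log fit y = exp(intercept) * x**slope, an estimated flow series is reconstructed from a recorder series d1. The slope, intercept, and series values are assumptions for illustration only.

import numpy as np
import pandas as pd

slope, intercept = 0.9, -0.2                       # assumed regression results ('x slopes', 'y intercept')
d1 = pd.Series([0.5, 1.2, 0.0, 3.4], name='recorder_site')
d1[d1 <= 0] = 0.001                                # guard against log(0), as in flow_est
est_flow = np.exp(np.log(d1) * slope + intercept)  # back-transform to flow units
est_flow[est_flow <= 0] = 0
print(est_flow)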
def test_sel_sites_poly():
    pts1 = vector.sel_sites_poly(sites_shp_path, rec_catch_shp_path, buffer_dis=10)

    assert (len(pts1) == 2) & isinstance(pts1, gpd.GeoDataFrame)
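A companion test sketch, not from the original test module, exercising xy_to_gpd with made-up NZTM coordinates; it assumes the same signature and module-level pd, gpd, and vector imports as the test above.

def test_xy_to_gpd():
    # Two hypothetical sites with NZTM x/y coordinates.
    df = pd.DataFrame({'site': ['a', 'b'], 'x': [1570000.0, 1571000.0], 'y': [5180000.0, 5181000.0]})
    pts = vector.xy_to_gpd('site', 'x', 'y', df, 2193)

    assert (len(pts) == 2) & isinstance(pts, gpd.GeoDataFrame)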
sites_rec_bool = flow_sites_gdf.FlowSite.isin(rec_summ1.ExtSiteID.unique())

sites_rec1 = flow_sites_gdf[sites_rec_bool].copy()
sites_man1 = flow_sites_gdf[~sites_rec_bool].copy()

flow_rec_sites1 = mssql.rd_sql(server, database, site_table, ['ExtSiteID', 'NZTMX', 'NZTMY'],
                               where_in={'ExtSiteID': rec_summ1.ExtSiteID.unique().tolist()})

flow_rec_sites2 = vector.xy_to_gpd('ExtSiteID', 'NZTMX', 'NZTMY', flow_rec_sites1)

## Estimate flow where recorder doesn't exist
sites_man2 = sites_man1.copy()
sites_man2['geometry'] = sites_man1.buffer(buffer_dis)

rec_sites2 = vector.sel_sites_poly(flow_rec_sites2, sites_man2)

rec_ts_data1 = mssql.rd_sql_ts(server, database, ts_table, 'ExtSiteID', 'DateTime', 'Value',
                               from_date=param['from_date'], to_date=param['to_date'],
                               where_in={'ExtSiteID': rec_sites2.ExtSiteID.tolist(),
                                         'DatasetTypeID': rec_summ1.DatasetTypeID.unique().tolist()})
rec_ts_data2 = rec_ts_data1.Value.unstack(0).interpolate('time', limit=10).dropna(axis=1)
rec_flow1 = rec_ts_data2.loc[:, rec_ts_data2.columns.isin(sites_rec1.FlowSite)].copy()

man_ts_data1 = mssql.rd_sql_ts(server, database, ts_table, 'ExtSiteID', 'DateTime', 'Value',
                               from_date=param['from_date'], to_date=param['to_date'],
                               where_in={'ExtSiteID': sites_man1.FlowSite.tolist(),
                                         'DatasetTypeID': man_datasets})
man_ts_data2 = man_ts_data1.Value.unstack(0)

reg_lst = []
new_lst = []

for col in man_ts_data2: