def plot_wateruse(wel_files, perioddata, add_data=None, wel_flux_col='q',
                  model_volume_units='$m^3$', model_time_units='day',
                  plot_volume_units='mgal', plot_time_units='day',
                  outfile=None):
    """Plot a timeseries of total pumping in MODFLOW WEL package input,
    optionally compared with one or more other water-use timeseries.

    Parameters
    ----------
    wel_files : dict
        External WEL package (table-style) input files, keyed by zero-based
        stress period. A header line with column names is assumed. For example:
        #k,i,j,q,boundname
    perioddata : str (csv file) or DataFrame
        Stress period information; must include 'per', 'start_datetime'
        and 'end_datetime' columns.
    add_data : dict, optional
        Additional timeseries to plot for comparison, keyed by label.
        Each value is a dict with a 'data' item (csv file or DataFrame)
        that must include 'q' and 'start_datetime' columns.
    wel_flux_col : str
        Column in the WEL files with pumping fluxes. By default, 'q'.
    model_volume_units : str
        Volume units of the WEL package fluxes.
    model_time_units : str
        Time units of the WEL package fluxes.
    plot_volume_units : str
        Volume units for the secondary (right) axis.
    plot_time_units : str
        Time units for the secondary (right) axis.
    outfile : str, optional
        Output file for the figure. If None, the matplotlib axes is returned.

    Returns
    -------
    ax : matplotlib axes
        Returned if no outfile is specified.
    """
    # read the stress period information
    if not isinstance(perioddata, pd.DataFrame):
        perioddata = pd.read_csv(perioddata)
    else:
        perioddata = perioddata.copy()
    perioddata.index = perioddata['per']

    dfs = []
    for i, f in wel_files.items():
        df = pd.read_csv(f, delim_whitespace=True)
        df.columns = [c.strip('#') for c in df.columns]
        df['per'] = i
        df['start_datetime'] = perioddata.loc[i, 'start_datetime']
        df['end_datetime'] = perioddata.loc[i, 'end_datetime']
        dfs.append(df)
    df = pd.concat(dfs)

    # sum the model pumping by stress period
    period_sums = df.groupby('per').first()
    period_sums[wel_flux_col] = df.groupby('per')[wel_flux_col].sum()
    # fill nan values (from any periods without wel files) with 0s
    period_sums = period_sums.reindex(range(period_sums.index.max() + 1))
    period_sums['start_datetime'] = perioddata['start_datetime']
    period_sums['end_datetime'] = perioddata['end_datetime']
    period_sums[wel_flux_col].fillna(0, inplace=True)
    period_sums.index = pd.to_datetime(period_sums['start_datetime'])
    period_sums['WEL package input'] = period_sums[wel_flux_col]
    period_sums = period_sums[['WEL package input', 'start_datetime', 'end_datetime']]

    # convert units
    model_vol_conv = convert_volume_units(model_volume_units, plot_volume_units)
    model_time_conv = convert_time_units(model_time_units, plot_time_units)
    model_conv = model_vol_conv * model_time_conv

    # plot any additional comparison data
    if add_data is not None:
        for label, items in add_data.items():
            # read the comparison timeseries
            if not isinstance(items['data'], pd.DataFrame):
                items['data'] = pd.read_csv(items['data'])
            req_cols = {'q', 'start_datetime'}
            assert not req_cols.difference(items['data'].columns), \
                f"add_data: {label} data must have columns: {req_cols}"
            items['data']['start_datetime'] = pd.to_datetime(items['data']['start_datetime'])
            aux_period_sums = items['data'].groupby('start_datetime').first()
            aux_period_sums[label] = items['data'].groupby('start_datetime')['q'].sum()
            # fill nan values (from any periods without wel files) with 0s
            #aux_period_sums[label].fillna(0, inplace=True)
            aux_period_sums['start_datetime'] = aux_period_sums.index
            period_sums = period_sums.join(aux_period_sums[[label]], how='outer')

    # forward fill nan WEL values
    # (where other times may have been inserted)
    period_sums['WEL package input'] = period_sums['WEL package input'].ffill()
    #period_sums = period_sums.resample('M').mean()  #.ffill()

    # make a plot
    fig, ax = plt.subplots(figsize=(11, 8.5))
    ax = period_sums.plot(ax=ax)
    units_text = f'{model_volume_units}/{model_time_units}'
    ax.set_ylabel(f'Pumpage, in {units_text}')
    ax.set_xlabel('')

    # second axis with another volume unit
    def second_axis_conversion(x):
        return x * model_conv

    def second_axis_conversion_r(x):
        return x * 1 / model_conv

    ax2 = ax.secondary_yaxis('right', functions=(second_axis_conversion,
                                                 second_axis_conversion_r))
    ax2.set_ylabel(f'Pumpage, in {plot_volume_units}/{plot_time_units}')
    #format_xtick_labels(period_sums, ax, maxlabels=30, date_format='%Y-%m-%d')

    # add the mean rate (in plot units) for each timeseries to the legend labels
    h, l = ax.get_legend_handles_labels()
    means = (period_sums.mean(axis=0) * model_conv).to_dict()
    plot_units_text = f'{plot_volume_units}/{plot_time_units}'
    labels_with_means = []
    for label in l:
        new_label = label
        if label in means:
            new_label += f' (mean: {means[label]:g} {plot_units_text})'
        labels_with_means.append(new_label)
    ax.legend(h, labels_with_means)

    if outfile is not None:
        Path(outfile).parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(outfile)
        plt.close()
        print(f'wrote {outfile}')
    else:
        return ax
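
# Example usage (illustrative sketch; the WEL file and table paths below are
# hypothetical placeholders): compare total WEL package pumping with a reported
# water-use timeseries that has 'q' and 'start_datetime' columns.
def _example_plot_wateruse():
    wel_files = {0: 'external/wel_000.dat',   # keyed by zero-based stress period
                 1: 'external/wel_001.dat'}
    ax = plot_wateruse(wel_files,
                       perioddata='tables/stress_period_data.csv',
                       add_data={'reported pumping':
                                 {'data': 'tables/reported_pumping.csv'}},
                       model_volume_units='$m^3$', model_time_units='day',
                       plot_volume_units='mgal', plot_time_units='day')
    return ax
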
def preprocess_flows( data, metadata=None, flow_data_columns=['flow'], start_date=None, active_area=None, active_area_id_column=None, active_area_feature_id=None, source_crs=4269, dest_crs=5070, datetime_col='datetime', site_no_col='site_no', line_id_col='line_id', x_coord_col='x', y_coord_col='y', name_col='name', flow_qualifier_column=None, default_qualifier='measured', include_sites=None, include_line_ids=None, source_volume_units='ft3', source_time_units='s', dest_volume_units='m3', dest_time_units='d', geographic_groups=None, geographic_groups_col=None, max_obsname_len=None, add_leading_zeros_to_sw_site_nos=False, column_renames=None, outfile=None, ): """Preprocess stream flow observation data, for example, from NWIS or another data source that outputs time series in CSV format with site locations and identifiers. * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates) to the CRS of the model (`dest_crs`) * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area * length and time units are converted to those of the groundwater model. * Prefixes for observation names (with an optional length limit) that identify the location are generated * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons (`geographic_groups` parameter) Parameters ---------- data : csv file or DataFrame Time series of stream flow observations. Columns: ===================== ====================================== site_no site identifier datetime measurement dates/times x x-coordinate of site y y-coordinate of site flow_data_columns Columns of observed streamflow values flow_qualifier_column Optional column with qualifiers for flow values ===================== ====================================== Notes: * x and y columns can alternatively be in the metadata table * flow_data_columns are denoted in `flow_data_columns`; multiple columns can be included to process base flow and total flow, or other statistics in tandem * For example, `flow_qualifier_column` may have "estimated" or "measured" flags denoting whether streamflows were derived from measured values or statistical estimates. metadata : csv file or DataFrame Stream flow observation site information. May include columns: ================= ================================================================================ site_no site identifier x x-coordinate of site y y-coordinate of site name name of site line_id_col Identifier for a line in a hydrography dataset that the site is associated with. ================= ================================================================================ Notes: * other columns in metadata will be passed through to the metadata output flow_data_columns : list of strings Columns in data with flow values or their statistics. By default, ['q_cfs'] start_date : str (YYYY-mm-dd) Simulation start date (cull observations before this date) active_area : str Shapefile with polygon to cull observations to. Automatically reprojected to dest_crs if the shapefile includes a .prj file. by default, None. active_area_id_column : str, optional Column in active_area with feature ids. By default, None, in which case all features are used. active_area_feature_id : str, optional ID of feature to use for active area By default, None, in which case all features are used. source_crs : obj Coordinate reference system of the head observation locations. 
A Python int, dict, str, or :class:`pyproj.crs.CRS` instance passed to :meth:`pyproj.crs.CRS.from_user_input` Can be any of: - PROJ string - Dictionary of PROJ parameters - PROJ keyword arguments for parameters - JSON string with PROJ parameters - CRS WKT string - An authority string [i.e. 'epsg:4326'] - An EPSG integer code [i.e. 4326] - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')] - An object with a `to_wkt` method. - A :class:`pyproj.crs.CRS` class By default, epsg:4269 dest_crs : obj Coordinate reference system of the model. Same input types as ``source_crs``. By default, epsg:5070 datetime_col : str, optional Column name in data with observation date/times, by default 'datetime' site_no_col : str, optional Column name in data and metadata with site identifiers, by default 'site_no' line_id_col : str, optional Column name in data or metadata with identifiers for hydrography lines associated with observation sites. by default 'line_id' x_coord_col : str, optional Column name in data or metadata with x-coordinates, by default 'x' y_coord_col : str, optional Column name in data or metadata with y-coordinates, by default 'y' name_col : str, optional Column name in data or metadata with observation site names, by default 'name' flow_qualifier_column : str, optional Column name in data with flow observation qualifiers, such as "measured" or "estimated" by default 'category' default_qualifier : str, optional Default qualifier to populate flow_qualifier_column if it is None. By default, "measured" include_sites : list-like, optional Exclude output to these sites. by default, None (include all sites) include_line_ids : list-like, optional Exclude output to these sites, represented by line identifiers. by default, None (include all sites) source_volume_units : str, 'm3', 'cubic meters', 'ft3', etc. Volume units of the source data. By default, 'ft3' source_time_units : str, 's', 'seconds', 'days', etc. Time units of the source data. By default, 's' dest_volume_units : str, 'm3', 'cubic meters', 'ft3', etc. Volume units of the output (model). By default, 'm3' dest_time_units : str, 's', 'seconds', 'days', etc. Time units of the output (model). By default, 'd' geographic_groups : file, dict or list-like Option to group observations by area(s) of interest. Can be a shapefile, list of shapefiles, or dictionary of shapely polygons. A 'group' column will be created in the metadata, and observation sites within each polygon will be assigned the group name associated with that polygon. For example:: geographic_groups='../source_data/extents/CompositeHydrographArea.shp' geographic_groups=['../source_data/extents/CompositeHydrographArea.shp'] geographic_groups={'cha': <shapely Polygon>} Where 'cha' is an observation group name for observations located within the the area defined by CompositeHydrographArea.shp. For shapefiles, group names are provided in a `geographic_groups_col`. geographic_groups_col : str Field name in the `geographic_groups` shapefile(s) containing the observation group names associated with each polygon. max_obsname_len : int or None Maximum length for observation name prefix. Default of 13 allows for a PEST obsnme of 20 characters or less with <prefix>_yyyydd or <prefix>_<per>d<per> (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1) If None, observation names will not be truncated. PEST++ does not have a limit on observation name length. 
    add_leading_zeros_to_sw_site_nos : bool
        Whether or not to pad site numbers using the
        :func:`~mapgwm.swflows.format_usgs_sw_site_id` function.
        By default, False.
    column_renames : dict, optional
        Option to rename columns in the data or metadata that are different
        than those listed above. For example, if the data file has a 'SITE_NO'
        column instead of 'site_no'::

            column_renames={'SITE_NO': 'site_no'}

        by default None, in which case the renames listed above will be used.
        Note that the rename destinations must match the column names listed
        above for :func:`mapgwm.swflows.preprocess_flows` to work.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to the
        file extension.

    Returns
    -------
    data : DataFrame
        Preprocessed time series
    metadata : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_
    """
    # outputs
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_shapefile = outname + '_info.shp'

    # read the source data
    if not isinstance(data, pd.DataFrame):
        df = pd.read_csv(data, dtype={site_no_col: object})
    else:
        df = data.copy()

    # check the columns
    for col in [datetime_col] + flow_data_columns:
        assert col in df.columns, "Column {} not found in {}".format(col, data)
    assert any({site_no_col, line_id_col}.intersection(df.columns)), \
        "Neither {} nor {} found in {}. Need to specify a site_no_col or line_id_col".format(
            site_no_col, line_id_col, data)

    # rename input columns to these names,
    # for consistent output
    dest_columns = {datetime_col: 'datetime',
                    site_no_col: 'site_no',
                    line_id_col: 'line_id',
                    x_coord_col: 'x',
                    y_coord_col: 'y',
                    name_col: 'name',
                    flow_qualifier_column: 'category'
                    }
    # update the default column renames
    # with any supplied via column_renames parameter
    if isinstance(column_renames, collections.abc.Mapping):
        dest_columns.update(column_renames)
    df.rename(columns=dest_columns, inplace=True)
    flow_data_columns = [c if c not in dest_columns else dest_columns[c]
                         for c in flow_data_columns]

    # convert site numbers to strings;
    # add leading 0s to any USGS sites that should have them
    if 'site_no' in df.columns:
        df['site_no'] = format_site_ids(df['site_no'], add_leading_zeros_to_sw_site_nos)
    else:
        df['site_no'] = df[line_id_col]

    # read the source metadata
    if metadata is not None:
        if not isinstance(metadata, pd.DataFrame):
            md = pd.read_csv(metadata, dtype={site_no_col: object})
        else:
            md = metadata.copy()
        if site_no_col not in md.columns or 'site_no' not in df.columns:
            raise IndexError('If metadata are supplied, both data and metadata must '
                             'have a site_no column.')
        md.rename(columns=dest_columns, inplace=True)
        md['site_no'] = format_site_ids(md['site_no'], add_leading_zeros_to_sw_site_nos)
        md.index = md['site_no']
        by_site = df.groupby('site_no')
        md['start_dt'] = pd.DataFrame(by_site['datetime'].first())
    else:
        by_site = df.groupby('site_no')
        md = pd.DataFrame(by_site['datetime'].first())
        md.columns = ['start_dt']
        md['site_no'] = md.index
    md['end_dt'] = pd.DataFrame(by_site['datetime'].last())
    md['n'] = pd.DataFrame(by_site['datetime'].count())
    md.reset_index(inplace=True, drop=True)

    # assign metadata if supplied
    for col in 'x', 'y', 'line_id', 'name':
        if col in df.columns and col not in md.columns:
            by_site_no = dict(zip(df['site_no'], df[col]))
            md[col] = [by_site_no[sn] for sn in md['site_no']]
if col != 'line_id': df.drop(col, axis=1, inplace=True) # index the dataframe to times; # truncate data before start date df.index = pd.to_datetime(df['datetime']) df.index.name = 'datetime' df = df.loc[start_date:].copy() # project x, y to model crs x_pr, y_pr = project((md.x.values, md.y.values), source_crs, dest_crs) md['x'], md['y'] = x_pr, y_pr md['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)] # cull data to that within the model area if active_area is not None: df, md = cull_data_to_active_area(df, active_area, active_area_id_column, active_area_feature_id, data_crs=dest_crs, metadata=md) # get the hydrography IDs corresponding to each site # using the included lookup table #if 'line_id' not in df.columns: # assert line_id_lookup is not None, \ # "need to include line_ids in a column, or line_id_lookup dictionary mapping line_ids to site numbers" # df = df.loc[df['site_no'].isin(line_id_lookup)].copy() # df['line_id'] = [line_id_lookup[sn] for sn in df['site_no']] if include_sites is not None: md = md.loc[md.site_no.isin(include_sites)] df = df.loc[df.site_no.isin(include_sites)] if include_line_ids is not None: md = md.loc[md.line_id.isin(include_line_ids)] df = df.loc[df.line_id.isin(include_line_ids)] # convert units # ensure that flow values are numeric (may be objects if taken directly from NWIS) unit_conversion = ( convert_volume_units(source_volume_units, dest_volume_units) / convert_time_units(source_time_units, dest_time_units)) for flow_col in flow_data_columns: df[flow_col] = pd.to_numeric(df[flow_col], errors='coerce') * unit_conversion df.dropna(subset=flow_data_columns, axis=0, inplace=True) # reformat qualifiers for consistent output # (lump to dest category columns of either estimated or measured) # with measured including values derived from baseflow separation or actual measurements) # output column name for flow qualifier column: dest_flow_qualifier_column = 'category' if flow_qualifier_column is not None: flow_qualifiers = { 'calculated': 'measured', # 'measured', 'base flow separated from measured values': 'measured', # 'measured', 'measured total flow': 'measured', 'estimated gaged': 'estimated', 'estimated ungaged': 'estimated' } df[dest_flow_qualifier_column] = df[flow_qualifier_column].replace( flow_qualifiers) else: df['category'] = default_qualifier # make unique n-character prefixes (site identifiers) for each observation location # 13 character length allows for prefix_yyyymmm in 20 character observation names # (BeoPEST limit) unique_obsnames = set() obsnames = [] for sn in md['site_no'].tolist(): if max_obsname_len is not None: name = make_obsname(sn, unique_names=unique_obsnames, maxlen=max_obsname_len) assert name not in unique_obsnames else: name = sn unique_obsnames.add(name) obsnames.append(name) md['obsprefix'] = obsnames # add area of interest information md['group'] = 'fluxes' md = assign_geographic_obsgroups(md, geographic_groups, geographic_groups_col, metadata_crs=dest_crs) # data columns data_cols = ['site_no', 'line_id', 'datetime' ] + flow_data_columns + ['category'] #if 'line_id' in md.columns and 'line_id' not in df.columns: # # only map line_ids to data if there are more site numbers # # implying that no site number maps to more than one line_id # if len(set(df.site_no)) >= len(set(df.line_id)): # ids = dict(zip(md['site_no'], md['line_id'])) # df['line_id'] = [ids[sn] for sn in df['site_no']] data_cols = [c for c in data_cols if c in df.columns] df = df[data_cols] md.index = md['site_no'] # save out the results if outfile is 
not None: df2shp(md.drop(['x', 'y'], axis=1), out_shapefile, crs=dest_crs) print('writing {}'.format(out_info_csvfile)) md.drop('geometry', axis=1).to_csv(out_info_csvfile, index=False, float_format='%g') print('writing {}'.format(out_data_csvfile)) df.to_csv(out_data_csvfile, index=False, float_format='%g') return df, md
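
# Example usage (illustrative sketch; file paths, CRS codes and the group shapefile
# are hypothetical placeholders): preprocess NWIS-style daily streamflows in cfs to
# m3/day for a model in EPSG:5070, grouping sites within a polygon of interest.
def _example_preprocess_flows():
    data, metadata = preprocess_flows(
        'source_data/nwis_daily_flows.csv',
        metadata='source_data/nwis_site_info.csv',
        flow_data_columns=['flow'],
        start_date='2010-01-01',
        source_crs=4269, dest_crs=5070,
        source_volume_units='ft3', source_time_units='s',
        dest_volume_units='m3', dest_time_units='d',
        geographic_groups='extents/CompositeHydrographArea.shp',
        geographic_groups_col='obsgroup',
        outfile='processed/processed_flows.csv')
    return data, metadata
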
def assign_monthly_production(self, outfile='processed_swuds.csv'): """ Assign production wells for water use, skipping IR (irrigation) and TE (thermal electric) to production zones. If production zones are not assigned or if the well bottom doesn't fall into a production zone, then the screen_top and screen_bot are assigned using well_depth and the default screen length. Production is given in cubic m per day. todo: add unit conversion parameter so other units can be used? Parameters ---------- outfile: str path to final processed monthly water-use file with production zone information """ # fill in missing monthly values with annual value for c in self.monthly_cols: idx = self.df.loc[self.df[c].isnull()].index.values self.df.loc[idx, c] = self.df.loc[idx, 'ANNUAL_VAL'] # pull out groundwater sites that are not IR, AQ or TE self.df = self.df.loc[(self.df['WATER_CD'] == 'GW') & ~(self.df['FROM_NAT_WATER_USE_CD'] == 'IR') & ~(self.df['FROM_NAT_WATER_USE_CD'] == 'AQ') & ~(self.df['FROM_NAT_WATER_USE_CD'] == 'TE')] # reshape dataframe to have monthly values in same column stacked = pd.DataFrame(self.df[self.monthly_cols].stack()) stacked.reset_index(inplace=True) stacked.rename(columns={ 'level_1': 'month', 0: 'q_monthly' }, inplace=True) stacked.q_monthly = stacked.q_monthly stacked.index = stacked.level_0 stacked = stacked.join(self.df) keep_cols = [c for c in stacked.columns if c not in self.monthly_cols] stacked = stacked[keep_cols] month = {name: i + 1 for i, name in enumerate(self.monthly_cols)} dates = [ '{}-{:02d}'.format(year, month[month_column_name]) for year, month_column_name in zip(stacked.YEAR, stacked.month) ] stacked['datetime'] = pd.to_datetime(dates) stacked.sort_values(by=['SITE_NO', 'datetime'], inplace=True) # set start and end dates if not already set if self.start_date is None: self.start_date = stacked.datetime.min() if self.end_date is None: self.end_date = stacked.datetime.max() groups = stacked.groupby('SITE_NO') all_groups = [] for site_no, group in groups: group = group.copy() group.index = pd.to_datetime(group['datetime']) start_date = pd.Timestamp(self.start_date) end_date = pd.Timestamp(self.end_date) monthly_values_2010 = group.loc[group.datetime.dt.year == 2010] monthly_values_2010 = dict( zip(monthly_values_2010.datetime.dt.month, monthly_values_2010.q_monthly)) avg_monthly_values = group.groupby( group.index.month).mean().q_monthly.to_dict() q_mean = group.q_monthly.mean() # reindex the site data to include all months for simulation period all_dates = pd.date_range(start_date, end_date, freq='MS') group = group.reindex(all_dates) # fill empty dates q = [] for month, q_monthly in zip(group.index.month, group.q_monthly): # try to use 2010 values if they exist if np.isnan(q_monthly): q_monthly = monthly_values_2010.get(month, np.nan) # otherwise take the average value for each month if np.isnan(q_monthly): q_monthly = avg_monthly_values[month] # fill missing months with the mean value for the site if np.isnan(q_monthly): q_monthly = q_mean q.append(q_monthly) # assume most values represent abstraction # if sum is positive, invert so that output values are negative if np.sum(q) > 0: q = -np.array(q) group['q'] = q #group['q'] = group['q'] * 3785.4 # convert from mgd to cubic m per d group['q'] = group['q'] * convert_volume_units( self.data_volume_units, self.model_length_units) group['site_no'] = f'swuds_{site_no}' group['well_elev'] = self.well_elevations[site_no] group['depth'] = self.depths[site_no] well_botm_depth = self.well_elevations[site_no] - 
self.depths[ site_no] group['x'] = np.nanmin(group['x']) group['y'] = np.nanmin(group['y']) # assign a production zone from default dict. If the bottom of the # well does not fall in a zone, or if the dictionary is empty; then # the production zone is assigned 'unnamed' production_zone = 'unnamed' for prod_name in self.prod_zone_top.keys(): prod_zone_top = self.prod_zone_top[prod_name][site_no] prod_zone_bot = self.prod_zone_bot[prod_name][site_no] if np.isnan(prod_zone_top) or np.isnan( prod_zone_bot): # missing zone group['screen_bot'] = self.well_elevations[ site_no] - self.depths[site_no] group['screen_top'] = self.well_elevations[ site_no] - self.depths[ site_no] + self.default_screen_len group['open_int_method'] = 'well depth' else: if well_botm_depth < prod_zone_top and well_botm_depth > prod_zone_bot: production_zone = prod_name group['screen_bot'] = prod_zone_bot group['screen_top'] = prod_zone_top group['open_int_method'] = 'production zone' else: group['screen_bot'] = self.well_elevations[ site_no] - self.depths[site_no] group['screen_top'] = self.well_elevations[ site_no] - self.depths[ site_no] + self.default_screen_len group['open_int_method'] = 'well depth' group['production_zone'] = production_zone # add aquifer name group['aquifer_name'] = self.aquifer_names.get( group["FROM_AQFR_CD"].values[0], 'unnamed') cols = [ 'site_no', 'q', 'q_monthly', 'month', 'well_elev', 'depth', 'screen_bot', 'screen_top', 'x', 'y' ] all_groups.append(group[cols]) self.df = pd.concat(all_groups) self.df[ 'start_datetime'] = self.df.index # start date of each pumping period if outfile is not None: outfile = Path(outfile) self.df.to_csv(outfile, index=False) print( 'processed SWUDS data written to {0} and in dataframe attribute' .format(outfile)) self.df['geometry'] = [ Point(x, y) for x, y in zip(self.df.x, self.df.y) ] # write only unique pumping values to shapefile to_shapefile = self.df.groupby(['site_no', 'q']).first().reset_index() shapefile = outfile.with_suffix('.shp') df2shp(to_shapefile, shapefile, crs=self.dest_crs)
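
# Example usage (illustrative sketch): assumes `swuds` is an already-populated
# instance of the class this method belongs to (SWUDS records, well elevations,
# depths, and production zone surfaces assigned elsewhere in the module).
def _example_assign_monthly_production(swuds):
    swuds.assign_monthly_production(outfile='processed/processed_swuds.csv')
    # the processed monthly records replace swuds.df; a CSV and a shapefile of
    # unique pumping values are also written
    return swuds.df
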
def test_convert_volume_units(): assert np.allclose(convert_volume_units('cubic meters', 'mgal'), 264.172 / 1e6) assert np.allclose(convert_volume_units('$m^3$', '$ft^3$'), 35.3147) assert np.allclose(convert_volume_units('cubic meters', 'cubic feet'), 35.3147) assert np.allclose(convert_volume_units('cubic feet', 'cubic meters'), 0.0283168) assert np.allclose(convert_volume_units('meters', 'feet'), 35.3147) assert np.allclose(convert_volume_units('feet', 'meters'), 0.0283168) assert np.allclose(convert_volume_units('feet3', 'm3'), 0.0283168) assert np.allclose(convert_volume_units('feet3', 'meters3'), 0.0283168) assert np.allclose(convert_volume_units('gallons', 'ft3'), 1 / 7.48052) assert np.allclose(convert_volume_units('gallons', 'm3'), (.3048**3) / 7.48052) assert np.allclose(convert_volume_units('gallons', 'acre foot'), 1 / 7.48052 / 43560) assert np.allclose(convert_volume_units('gallons', 'af'), 1 / 7.48052 / 43560) assert np.allclose(convert_volume_units('gallons', 'acre-ft'), 1 / 7.48052 / 43560) assert np.allclose(convert_volume_units('mgal', 'acre-ft'), 1e6 / 7.48052 / 43560) assert np.allclose(convert_volume_units('liters', 'gallon'), 1 / 3.78541) assert np.allclose(convert_volume_units(None, 'cubic feet'), 1.) assert np.allclose(convert_volume_units('cubic feet', None), 1.) assert np.allclose(convert_volume_units('junk', 'junk'), 1.)
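
# Additional illustrative checks (a sketch, not part of the original test suite):
# composite unit conversions built from convert_volume_units, as used by the
# preprocessing functions in this package. The ~3785.4 mgal -> m3 factor mirrors
# the commented "mgd to cubic m per d" conversion in assign_monthly_production.
def test_convert_volume_units_composite_example():
    assert np.allclose(convert_volume_units('mgal', 'm3'), 3785.41, rtol=1e-4)
    assert np.allclose(convert_volume_units('ft3', 'm3'), 0.0283168)
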
def resample_pumping_rates(wu_file, wu_points, model, active_area=None, minimum_layer_thickness=2, drop_ids=None, exclude_steady_state=True, dropna=False, na_fill_value=0., verbose=False): """Read water use data from a master file generated from WDNR_wu_data.ipynb. Cull data to area of model. Convert from monthly gallons to daily averages in m3/d for model stress periods. Parameters ---------- wu_file : csv file Water use data ouput from the WDNR_wu_data.ipynb. wu_points : point shapefile Water use locations, generated in the WDNR_wu_data.ipynb Must be in same CRS as sr. model : flopy.modflow.Modflow instance Must have a valid attached .sr attribute defining the model grid. Only wells within the bounds of the sr will be retained. Sr is also used for row/column lookup. Must be in same CRS as wu_points. active_area : str (shapefile path) or shapely.geometry.Polygon Polygon denoting active area of the model. If specified, wells are culled to this area instead of the model bounding box. (default None) exclude_steady_state : bool Exclude steady-state stress periods from resampled output. (default True) minimum_layer_thickness : scalar Minimum layer thickness to have pumping. dropna : bool Flag to drop times (stress periods) where there is no data for a well na_fill_value : float If dropna == False, fill missing times (stress periods) with this value. Returns ------- wu_data : DataFrame """ assert not np.isnan(na_fill_value), "na_fill_value must be a number!" well_info, monthly_data = read_wdnr_monthly_water_use(wu_file, wu_points, model, drop_ids=drop_ids, active_area=active_area, minimum_layer_thickness=minimum_layer_thickness) print('\nResampling pumping rates in {} to model stress periods...'.format(wu_file)) if dropna: print(' wells with no data for a stress period will be dropped from that stress period.') else: print(' wells with no data for a stress period will be assigned {} pumping rates.'.format(na_fill_value)) if exclude_steady_state: perioddata = model.perioddata.loc[~model.perioddata.steady].copy() else: perioddata = model.perioddata.copy() t0 = time.time() # reindex the record at each site to the model stress periods dfs = [] for site, sitedata in monthly_data.groupby('site_no'): if site not in well_info.index: continue sitedata.index = sitedata.datetime assert not sitedata.index.duplicated().any() if dropna: site_period_data = sitedata.reindex(perioddata.start_datetime).dropna(axis=1) else: site_period_data = sitedata.reindex(perioddata.start_datetime, fill_value=na_fill_value) isna = site_period_data['site_no'] == 0. 
if np.any(isna): if verbose: years = set(site_period_data.loc[isna, 'year']) years = ', '.join(list(years)) print('Site {} has {} times with nans (in years {})- filling with {}s'.format(site, np.sum(isna), years, na_fill_value)) site_period_data['site_no'] = site site_period_data['year'] = site_period_data.index.year site_period_data['month'] = site_period_data.index.month site_period_data['datetime'] = site_period_data.index assert not site_period_data.isna().any().any() site_period_data.index = perioddata.index # copy stress periods and lengths from master stress period table for col in ['perlen', 'per']: site_period_data[col] = perioddata[col] # convert units from monthly gallon totals to daily model length units site_period_data['gal_d'] = site_period_data['gallons'] / site_period_data['perlen'] gal_to_model_units = convert_volume_units('gal', get_model_length_units(model))#model.dis.lenuni] site_period_data['q'] = site_period_data.gal_d * gal_to_model_units for col in ['i', 'j', 'k']: site_period_data[col] = well_info.loc[site, col] site_period_data.index = [site] * len(site_period_data) dfs.append(site_period_data[['k', 'i', 'j', 'q', 'per']]) wel_data = pd.concat(dfs) # water use fluxes should be negative if not wel_data.q.max() <= 0: wel_data.loc[wel_data.q.abs() != 0., 'q'] *= -1 wel_data['boundname'] = ['site{:d}'.format(s) for s in wel_data.index] assert not np.any(wel_data.isna()), "Nans in Well Data" print("took {:.2f}s\n".format(time.time() - t0)) return wel_data
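
# Example usage (illustrative sketch; the water-use CSV and point shapefile paths
# are hypothetical placeholders, and `model` is a flopy model with an attached
# perioddata table): build transient WEL package fluxes from monthly WDNR data.
def _example_resample_pumping_rates(model):
    wel_data = resample_pumping_rates('source_data/master_wu.csv',
                                      'source_data/wu_points.shp',
                                      model,
                                      minimum_layer_thickness=2,
                                      dropna=False, na_fill_value=0.)
    return wel_data
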
def get_mean_pumping_rates(wu_file, wu_points, model, start_date='2012-01-01', end_date='2018-12-31', period_stats={0: 'mean'}, active_area=None, drop_ids=None, minimum_layer_thickness=2): """Read water use data from a master file generated from WDNR_wu_data.ipynb. Cull data to area of model. Convert from monthly gallons to daily averages in m3/d for model stress periods. Parameters ---------- wu_file : csv file Water use data ouput from the WDNR_wu_data.ipynb. wu_points : point shapefile Water use locations, generated in the WDNR_wu_data.ipynb Must be in same CRS as sr. model : flopy.modflow.Modflow instance Must have a valid attached .sr attribute defining the model grid. Only wells within the bounds of the sr will be retained. Sr is also used for row/column lookup. Must be in same CRS as wu_points. start_date : str (YYYY-MM-dd) Start date of time period to average. end_date : str (YYYY-MM-dd) End date of time period to average. period_stats : dict Dictionary of stats keyed by stress period. Stats include zero values, unless noted. keys : 0, 1, 2 ... values: str; indicate statistic to apply for each stress period 'mean': mean pumping for period defined by start_date and end_date '<month>': average for a month of the year (e.g. 'august'), for the for period defined by start_date and end_date minimum_layer_thickness : scalar Minimum layer thickness to have pumping. Returns ------- wu_data : DataFrame """ start_date, end_date = pd.Timestamp(start_date), pd.Timestamp(end_date) well_info, monthly_data = read_wdnr_monthly_water_use(wu_file, wu_points, model, active_area=active_area, drop_ids=drop_ids, minimum_layer_thickness=minimum_layer_thickness) if well_info is None: return # determine period for computing average pumping # make a dataframe for each stress period listed wel_data = [] for per, stat in period_stats.items(): if isinstance(stat, str): stat = stat.lower() elif isinstance(stat, list): stat, start_date, end_date = stat start_date, end_date = pd.Timestamp(start_date), pd.Timestamp(end_date) stat = stat.lower() # slice the monthly values to the period of start_date, end_date # aggregate to mean values in m3/d # (this section will need some work for generalized transient run setup) is_inperiod = (monthly_data.datetime > start_date) & (monthly_data.datetime < end_date) inperiod = monthly_data.loc[is_inperiod].copy() # compute average daily flux using the sum and number of days for each site # (otherwise each month is weighted equally) # convert units from monthly gallons to daily gallons inperiod['days'] = inperiod.datetime.dt.daysinmonth if stat == 'mean': period_data = inperiod.copy() # mean for given month (e.g. 
august mean) elif stat in months.keys() or stat in months.values(): period_data = inperiod.loc[inperiod.month == months.get(stat, stat)].copy() else: raise ValueError('Unrecognized input for stat: {}'.format(stat)) site_means = period_data.groupby('site_no').mean() site_sums = period_data.groupby('site_no').sum() site_means['gal_d'] = site_sums['gallons'] / site_sums['days'] # conversion to model units is based on lenuni variable in DIS package gal_to_model_units = convert_volume_units('gal', get_model_length_units(model)) site_means['q'] = site_means.gal_d * gal_to_model_units site_means['per'] = per wel_data.append(well_info[['k', 'i', 'j']].join(site_means[['q', 'per']], how='inner')) wel_data = pd.concat(wel_data, axis=0) # water use fluxes should be negative if not wel_data.q.max() <= 0: wel_data.loc[wel_data.q.abs() != 0., 'q'] *= -1 wel_data['boundname'] = ['site{:d}'.format(s) for s in wel_data.index] assert not np.any(wel_data.isna()), "Nans in Well Data" return wel_data
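
# Example usage (illustrative sketch; file paths are hypothetical placeholders):
# assign mean 2012-2018 pumping to stress period 0 and an August average to
# stress period 1, following the period_stats conventions described above.
def _example_get_mean_pumping_rates(model):
    wel_data = get_mean_pumping_rates('source_data/master_wu.csv',
                                      'source_data/wu_points.shp',
                                      model,
                                      start_date='2012-01-01',
                                      end_date='2018-12-31',
                                      period_stats={0: 'mean', 1: 'august'})
    return wel_data
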
def preprocess_iwum_pumping(ncfile, start_date=None, end_date=None, active_area=None, active_area_id_column=None, active_area_feature_id=None, estimated_production_zone_top=None, estimated_production_zone_botm=None, flux_variable='value', nc_crs=5070, dest_crs=5070, nc_length_units='meters', estimated_production_surface_units='meters', model_length_units='meters', outfile=None): """Get pumping from the Irrigation Water Use Model (IWUM; Wilson, 2020) output and assign open interval information, using raster surfaces of the top and bottom of an estimated production zone. Parameters ---------- ncfile : file path NetCDF output from Irrigation Water Use Model start_date : str Cull data before this date. end_date : str Cull data after this date. active_area : str Shapefile with polygon to cull observations to. Automatically reprojected to dest_crs if the shapefile includes a .prj file. by default, None. active_area_id_column : str, optional Column in active_area with feature ids. By default, None, in which case all features are used. active_area_feature_id : str, optional ID of feature to use for active area By default, None, in which case all features are used. estimated_production_zone_top : file path Raster surface for assigning screen tops estimated_production_zone_botm : file path Raster surface for assigning screen bottoms flux_variable : str Varible in ncfile for pumping fluxes. Fluxes are assumed to represent total volumes for each time period. nc_crs : obj Coordinate Reference System (CRS) of ncfile. A Python int, dict, str, or pyproj.crs.CRS instance passed to the pyproj.crs.from_user_input See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input. Can be any of: - PROJ string - Dictionary of PROJ parameters - PROJ keyword arguments for parameters - JSON string with PROJ parameters - CRS WKT string - An authority string [i.e. 'epsg:4326'] - An EPSG integer code [i.e. 4326] - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')] - An object with a `to_wkt` method. - A :class:`pyproj.crs.CRS` class nc_length_units : str, {'meters', 'ft', etc.} Length units of pumped volumes in ncfile estimated_production_surface_units : str, {'meters', 'ft', etc.} Length units of elevations in estimated production surface rasters. model_length_units : str, {'meters', 'ft', etc.} Length units of model. outfile : csv file for output table Returns ------- df : DataFrame Table of pumping rates in m3/day, location and open interval information. Columns: ============== ================================================ site_no index position of pumping rate in ncfile grid x x-coordinate in `dest_crs` y y-coordinate in `dest_crs` start_datetime start date of pumping period end_datetime end date of pumping period screen_top screen top elevation, in `model_length_units` screen_botm screen bottom elevation, in `model_length_units` q pumping rate, in model units geometry shapely Point object representing location ============== ================================================ Notes ----- * Time units are assumed to be days. * Fluxes are assumed to represent total volumes for each time period indicated by the differences between successive values along the time axis of ncfile. 
""" ds = xr.open_dataset(ncfile) time_variable = [k for k in ds.coords.keys() if k.lower() not in {'x', 'y'}][0] ds_x, ds_y = np.meshgrid(ds['x'], ds['y']) # original values are in m3, in each 1 mi2 cell # can leave in m3 if reassigning to 1km grid as point values length_conversion = convert_volume_units(nc_length_units, model_length_units) ** 3 unit_suffix = vol_suffix[model_length_units] + 'd' flux_col = 'q' # 'flux_{}'.format(unit_suffix) # output field name for fluxes # get top/botm elevations est_screen_top = None est_screen_botm = None if estimated_production_zone_top is not None and \ estimated_production_zone_botm is not None: surf_unit_conversion = convert_length_units(estimated_production_surface_units, model_length_units) est_screen_top = get_values_at_points(estimated_production_zone_top, ds_x, ds_y, points_crs=nc_crs) est_screen_top *= surf_unit_conversion est_screen_botm = get_values_at_points(estimated_production_zone_botm, ds_x, ds_y, points_crs=nc_crs) est_screen_botm *= surf_unit_conversion # in any places where screen top is less than the screen botm, # set both at the mean loc = est_screen_top < est_screen_botm means = np.mean([est_screen_top, est_screen_botm], axis=0) est_screen_top[loc] = means[loc] est_screen_botm[loc] = means[loc] print(f'Reset screen top and bottom to mean elevation at {loc.ravel().sum()} ' f'locations where screen top was < screen bottom') dfs = [] times = pd.DatetimeIndex(ds[time_variable].loc[start_date:end_date].values) for n, period_start_date in enumerate(times): # for each time entry, get the data kwargs = {time_variable: period_start_date} arr = ds[flux_variable].sel(**kwargs).values # make sure pumping sign is negative # based on assumption that values are mostly abstraction if arr.sum() > 0: arr *= -1 # set up a dataframe data = {'site_no': np.arange(ds_x.size), 'x': ds_x.ravel(), 'y': ds_y.ravel(), } if est_screen_top is not None and est_screen_botm is not None: data.update({'screen_top': est_screen_top.ravel(), 'screen_botm': est_screen_botm.ravel() } ) df = pd.DataFrame(data) df['start_datetime'] = period_start_date # get the end_date, handling last entry if n + 1 < len(times): period_end_date = times[n + 1] else: # set end date for last period on previous period length last_start = dfs[-1]['start_datetime'].values[0] ndays = (pd.Timestamp(period_start_date) - pd.Timestamp(last_start)).days period_end_date = period_start_date + pd.Timedelta(ndays, unit='d') # convert the time units ndays = (pd.Timestamp(period_end_date) - pd.Timestamp(period_start_date)).days assert ndays > 0, "period_end_date {} is before period_start_date {}"\ .format(period_end_date, period_start_date) time_conversion = 1 / ndays # original quantities are volumes for the time period # time indexing in pandas is through last value period_end_date = pd.Timestamp(period_end_date) - pd.Timedelta(1, unit='d') df['end_datetime'] = period_end_date df[flux_col] = arr.ravel() * length_conversion * time_conversion # only includes fluxes > 0 df = df.loc[df[flux_col] < 0] dfs.append(df) df = pd.concat(dfs) # site number column (that would be unique from other integers from other data sources) df['site_no'] = [f'iwum_{node}' for node in df.site_no] # project the data to a destination crs, if provided # make a separate metadata dataframe with 1 row per location # to avoid redundant operations metadata = df.groupby('site_no').first().reset_index()[['site_no', 'x', 'y']] metadata.index = metadata['site_no'] x_pr, y_pr = project((metadata.x.values, metadata.y.values), nc_crs, 
dest_crs) metadata['x'], metadata['y'] = x_pr, y_pr metadata['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)] # cull the data to the model area, if provided if active_area is not None: df, metadata = cull_data_to_active_area(df, active_area, active_area_id_column, active_area_feature_id, data_crs=dest_crs, metadata=metadata) # update data with x,y values projected in metadata x = dict(zip(metadata.site_no, metadata.x)) y = dict(zip(metadata.site_no, metadata.y)) df['x'] = [x[sn] for sn in df.site_no] df['y'] = [y[sn] for sn in df.site_no] if outfile is not None: outfile = Path(outfile) df.to_csv(outfile, index=False, float_format='%g') print('wrote {}'.format(outfile)) # Make a plot of iwum output in mgal/day out_pdf_path = outfile.parent / 'plots' out_pdf_path.mkdir(exist_ok=True) plot_iwum_output(ncfile, flux_variable=flux_variable, outpath=out_pdf_path) return df
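
# Example usage (illustrative sketch; the NetCDF file and production zone rasters
# are hypothetical placeholders): assign IWUM pumping to estimated open intervals
# and write a processed CSV (a plot is also written to a 'plots' subfolder).
def _example_preprocess_iwum_pumping():
    df = preprocess_iwum_pumping(
        'source_data/iwum_output.nc',
        start_date='2011-01-01', end_date='2018-12-31',
        estimated_production_zone_top='rasters/production_zone_top.tif',
        estimated_production_zone_botm='rasters/production_zone_botm.tif',
        nc_crs=5070, dest_crs=5070,
        nc_length_units='meters', model_length_units='meters',
        outfile='processed/iwum_pumping.csv')
    return df
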
def plot_iwum_output(ncfile, flux_variable='value', outpath='.'):
    """Make a plot of IWUM output in mgal/day
    for comparison with subsequent datasets.
    """
    ds = xr.open_dataset(ncfile)
    time_variable = [k for k in ds.coords.keys() if k.lower() not in {'x', 'y'}][0]
    # sum fluxes across the two spatial dimensions,
    # leaving a timeseries of total volumes
    xydims = tuple([i for i, dim_len in enumerate(ds[flux_variable].shape)
                    if dim_len != ds[time_variable].shape[0]])
    ts = ds[flux_variable][:, :, :].sum(axis=xydims).to_pandas()
    if ts.index.dtype == object:
        ts.index = pd.to_datetime(ts.index)
    ndays = pd.to_timedelta(np.diff(ts.index)).days.tolist()
    ndays.append(ndays[-1])  # pad the last time period
    df = pd.DataFrame(ts, columns=['m3'])
    df['m3d'] = df['m3'] / ndays  # convert volumes to daily rate

    fig, ax = plt.subplots(figsize=(11, 8.5))
    ax = df['m3d'].plot.bar(ax=ax)
    ax.set_ylabel('Cubic meters per day')
    ymin, ymax = ax.get_ylim()
    ax2 = ax.twinx()
    to_mg = convert_volume_units('m3', 'mgal')
    ax2.set_ylim(ymin * to_mg, ymax * to_mg)
    ax2.set_ylabel('Million gallons per day')
    # can't use .mean(),
    # because periods with 0 pumping may not be included
    mean_mgd = df['m3'].sum() * to_mg / np.sum(ndays)
    ax2.axhline(mean_mgd, c='r')
    ax2.text(0.75, 0.9, 'Mean: {:,.0f} mgal/day'.format(mean_mgd), transform=ax.transAxes)

    # format the tick labels
    format_xtick_labels(df, ax, maxlabels=30, date_format='%Y-%m-%d')
    #maxlabels = 30
    #xticklabels = df.index.strftime('%Y-%m-%d').tolist()
    #stride = max(int(np.floor(len(xticklabels) / maxlabels)), 1)
    #formatted_labels = []
    #for label in xticklabels[::stride]:
    #    formatted_labels += [label] + [''] * (stride - 1)
    #formatted_labels = formatted_labels[:len(xticklabels)]
    #junk = ax.set_xticklabels(formatted_labels)

    # record the file name and last modified date
    ftime = pd.Timestamp(os.path.getmtime(ncfile), unit='s')
    ax2.text(0.02, 0.98, '{}\n{}'.format(ncfile, ftime.strftime('%Y-%m-%d')),
             va='top', fontsize=8, transform=ax.transAxes)

    # annotate the bars with the values
    for i, p in enumerate(ax.patches):
        value = '{:,.0f}'.format(to_mg * p.get_height())
        ax.annotate(value, (p.get_x() * 1.01, p.get_height() * 1.01),
                    ha='center', fontsize=8)

    ncfile = Path(ncfile)
    ftime = pd.Timestamp(ncfile.stat().st_mtime, unit='s')
    outfile = Path(outpath, f'{ncfile.name}_{ftime:%Y-%m-%d}.pdf')
    plt.savefig(outfile)
    print('wrote {}'.format(outfile))
    plt.close()
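
# Example usage (illustrative sketch; the NetCDF path and output folder are
# hypothetical placeholders):
def _example_plot_iwum_output():
    plot_iwum_output('source_data/iwum_output.nc', flux_variable='value',
                     outpath='processed/plots')
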
def preprocess_te_wateruse(data, start_date=None, end_date=None, active_area=None, active_area_id_column=None, active_area_feature_id=None, estimated_production_zone_top=None, estimated_production_zone_botm=None, estimated_production_surface_units='feet', source_crs=4269, dest_crs=5070, interp_method='linear', data_volume_units='mgal', model_length_units='meters', outfile=None): """Preprocess water use data from thermoelectric power plants: * reproject data to a destination CRS `dest_crs`) * cull data to an area of interest (`active_area`) * if input data do not have information on the well screen intervals; sample screen tops and bottoms from raster surfaces bounding an estimated production zone (e.g. `estimated_production_zone_top`) * reindex the data to continous monthly values extending from `start_date` to `end_date`. Typically, these would bracket the time period for which the pumping should be simulated in a model. For example, the earliest data may be from 2010, but if the model starts in 2008, it may be appropriate to begin using the 2010 rates then (``start_date='2008'``). If no start or end date are given, the first and last years of pumping in `data` are used. * fill empty months by interpolation via a specified `interp_method` * backfill any remaining empty months going back to the `start_date` * write processed data to a CSV file and shapefile of the same name Parameters ---------- data : DataFrame Thermoelectric water use data in the following format (similar to that output by :func:`mapgwm.te_wateruse.read_te_water_use_spreadsheet`): =============== ======================================================= site_no power plant identifier (plant code) start_datetime pandas datetime representative of flux (e.g. '2010') x x-coordinate of withdrawl, in `source_crs` y y-coordinate of withdrawl, in `source_crs` q withdrawl flux, in `data_volume_units` per days =============== ======================================================= start_date : str Start date for pumping rates. If earlier than the dates in `data`, pumping rates will be backfilled to this date. end_date : str End date for pumping rates. If later than the dates in `data`, pumping rates will be forward filled to this date. active_area : str Shapefile with polygon to cull observations to. Automatically reprojected to dest_crs if the shapefile includes a .prj file. by default, None. active_area_id_column : str, optional Column in active_area with feature ids. By default, None, in which case all features are used. active_area_feature_id : str, optional ID of feature to use for active area By default, None, in which case all features are used. estimated_production_zone_top : file path Raster surface for assigning screen tops estimated_production_zone_botm : file path Raster surface for assigning screen bottoms estimated_production_surface_units : str, {'meters', 'ft', etc.} Length units of elevations in estimated production surface rasters. source_crs : obj Coordinate reference system of the head observation locations. A Python int, dict, str, or :class:`pyproj.crs.CRS` instance passed to :meth:`pyproj.crs.CRS.from_user_input` Can be any of: - PROJ string - Dictionary of PROJ parameters - PROJ keyword arguments for parameters - JSON string with PROJ parameters - CRS WKT string - An authority string [i.e. 'epsg:4326'] - An EPSG integer code [i.e. 4326] - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')] - An object with a `to_wkt` method. 
        - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    interp_method : str
        Interpolation method to use for filling pumping rates to monthly values.
        By default, 'linear'
    data_volume_units : str; e.g. 'mgal', 'm3', 'cubic feet', etc.
        Volume units of pumping data. All time units are assumed to be in days.
    model_length_units : str; e.g. 'feet', 'm', 'meters', etc.
        Length units of model.
    outfile : str
        Path for output file. A shapefile of the same name is also written.
        If None, no output file is written. By default, None

    Returns
    -------
    df_monthly : DataFrame

    Notes
    -----
    * time units for TE data and model are assumed to be days
    """
    df = data.copy()

    # reproject to dest_crs
    x, y = project(zip(df['x'], df['y']), source_crs, dest_crs)
    df['x'], df['y'] = x, y
    df['geometry'] = [Point(x, y) for x, y in zip(x, y)]

    # drop wells with no location information (for now)
    df.dropna(subset=['x', 'y'], axis=0, inplace=True)

    # cull data to that within the model area
    if active_area is not None:
        df = cull_data_to_active_area(df, active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs)

    # get top and bottom of estimated production interval at each well
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(estimated_production_surface_units,
                                                    model_length_units)
        x, y = df.x.values, df.y.values
        est_screen_top = get_values_at_points(estimated_production_zone_top, x, y,
                                              points_crs=dest_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm, x, y,
                                               points_crs=dest_crs)
        est_screen_botm *= surf_unit_conversion
        df['screen_top'] = est_screen_top
        df['screen_botm'] = est_screen_botm

    # distribute fluxes to monthly values
    # set start and end dates if not already set
    if start_date is None:
        start_date = df.start_datetime.min()
    if end_date is None:
        end_date = df.start_datetime.max()
    groups = df.groupby('site_no')
    all_groups = []
    for site_no, group in groups:
        dfg = group.copy()

        # create a continuous monthly time index
        # labeled at the month start
        all_dates = pd.date_range(start_date, end_date, freq='MS')
        dfg.index = dfg['start_datetime']
        dfg = dfg.reindex(all_dates)

        # interpolate the discharge values;
        # back filling to the start date
        dfg['q'] = dfg.q.interpolate(method=interp_method).bfill()
        dfg['q'] *= convert_volume_units(data_volume_units, model_length_units)

        # fill remaining columns
        dfg['start_datetime'] = dfg.index
        fill_columns = set(dfg.columns).difference({'q', 'start_datetime'})
        fill_values = group.iloc[0].to_dict()
        for c in fill_columns:
            dfg[c] = fill_values[c]

        # add 'te' prefix to site number
        dfg['site_no'] = f'te_{site_no}'
        all_groups.append(dfg)

    df_monthly = pd.concat(all_groups)

    # assume most values represent abstraction
    # if sum is positive, invert so that output values are negative
    if df_monthly['q'].sum() > 0:
        df_monthly['q'] *= -1

    # clean up the columns
    cols = ['site_no', 'start_datetime', 'x', 'y',
            'screen_top', 'screen_botm', 'q', 'geometry']
    cols += list(set(df_monthly.columns).difference(cols))
    df_monthly = df_monthly[cols]

    # write the output
    if outfile is not None:
        outfile = Path(outfile)
        df_monthly.drop('geometry', axis=1).to_csv(outfile, index=False, float_format='%g')
        print('wrote {}'.format(outfile))

        # write only unique pumping values to shapefile
        to_shapefile = df_monthly.groupby(['site_no', 'q']).first().reset_index()
        shapefile = outfile.with_suffix('.shp')
        df2shp(to_shapefile, shapefile, crs=dest_crs)
    return df_monthly
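
# Example usage (illustrative sketch; `te_data` is a DataFrame with site_no,
# start_datetime, x, y and q columns, and the raster paths are hypothetical
# placeholders): fill thermoelectric withdrawals to monthly values in m3/day.
def _example_preprocess_te_wateruse(te_data):
    df_monthly = preprocess_te_wateruse(
        te_data,
        start_date='2008-01-01', end_date='2018-12-31',
        estimated_production_zone_top='rasters/production_zone_top.tif',
        estimated_production_zone_botm='rasters/production_zone_botm.tif',
        estimated_production_surface_units='feet',
        source_crs=4269, dest_crs=5070,
        data_volume_units='mgal', model_length_units='meters',
        outfile='processed/te_pumping.csv')
    return df_monthly
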