def test_locate_sites(shellmound_sfrdata, reach_id_col, outdir):
    X, Y, rno = zip(*((515459.9, 1189906.1, 202),
                      (515375.2, 1189942.5, 204)))
    df = pd.DataFrame({
        'geometry': [Point(x, y) for x, y in zip(X, Y)],
        'site_no': rno
    })
    sites_shapefile = '{}/sites.shp'.format(outdir)
    df2shp(df, sites_shapefile, crs=5070)
    sfrlines_shapefile = '{}/shellmound_lines.shp'.format(outdir)
    shellmound_sfrdata.export_lines(sfrlines_shapefile)
    # test reading sfrlines as a dataframe
    # and sfrlines without a reach number column
    if reach_id_col is None:
        reach_id_col = 'rno'
        sfrlines = gpd.read_file(sfrlines_shapefile)
        sfrlines.drop('rno', axis=1, inplace=True)
        sfrlines_shapefile = sfrlines
    active_area = box(*shellmound_sfrdata.grid.bounds)
    locs = locate_sites(sites_shapefile, sfrlines_shapefile, active_area,
                        keep_columns=None,
                        reach_id_col=reach_id_col,
                        ireach_col='ireach',
                        iseg_col='iseg',
                        site_number_col='site_no',
                        perimeter_buffer=1000,
                        distance_threshold=1600)
    assert np.array_equal(locs.rno.values, locs.site_no.values)
    # check that iseg and ireach columns are in the located sites table
    # (for modflow-2005 style sfr packages)
    assert 'iseg' in locs.columns
    assert 'ireach' in locs.columns
def export_reach_data(reach_data, grid, filename,
                      nodes=None, geomtype='Polygon'):
    """Generic method for exporting data to a shapefile; joins
    attributes in reach_data to geometries in grid using node numbers.
    """
    assert grid is not None, "need grid attribute for export"
    if nodes is not None:
        keep = [n in nodes for n in reach_data.node]
        rd = reach_data.loc[keep].copy()
    else:
        rd = reach_data.copy()
    assert isinstance(grid, sfrmaker.grid.Grid), \
        "grid needs to be an sfrmaker.Grid instance"
    assert np.array_equal(grid.df.node.values, np.arange(grid.size))
    assert np.array_equal(grid.df.node.values, grid.df.index.values)
    polygons = grid.df.loc[rd.node, 'geometry'].values
    epsg = grid.crs.epsg
    proj_str = grid.crs.proj_str
    if geomtype.lower() == 'polygon':
        rd['geometry'] = polygons
    elif geomtype.lower() == 'point':
        rd['geometry'] = [p.centroid for p in polygons]
    else:
        raise ValueError('Unrecognized geomtype "{}"'.format(geomtype))
    df2shp(rd, filename, epsg=epsg, proj_str=proj_str)
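# Hedged usage sketch for export_reach_data. `sfrdata` and its attributes are
# hypothetical stand-ins for an sfrmaker SFRData-like object with a
# `reach_data` DataFrame (containing a 'node' column) and an attached
# sfrmaker.grid.Grid instance:
#
# export_reach_data(sfrdata.reach_data, sfrdata.grid, 'reach_cells.shp')
# export_reach_data(sfrdata.reach_data, sfrdata.grid, 'reach_points.shp',
#                   nodes=[0, 1, 2], geomtype='Point')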
def active_area(outfolder):
    active_area_tuple = -90.55, 33.5, -90.16, 33.86
    active_area_poly = box(*active_area_tuple)
    df = pd.DataFrame({'geometry': [active_area_poly], 'id': [0]})
    active_area = os.path.join(outfolder, 'active_area.shp')
    df2shp(df, active_area, crs=4269)
    return active_area_tuple
def preprocessed_flowlines(test_data_path, culled_flowlines, outfolder,
                           project_root_path):
    kwargs = culled_flowlines.copy()
    #kwargs['demfile'] = os.path.join(test_data_path, 'meras_100m_dem.tif')
    kwargs['demfile'] = os.path.join(test_data_path, 'meras_30m_dem.tif')
    #kwargs['demfile'] = os.path.join(project_root_path, 'examples/meras/dem_min_elevs_1000.tif')
    kwargs['dem_length_units'] = 'feet'
    kwargs['narwidth_shapefile'] = os.path.join(test_data_path, 'NARwidth.shp')
    kwargs['waterbody_shapefiles'] = os.path.join(
        test_data_path, 'NHDPlus08/NHDSnapshot/Hydrography/NHDWaterbody.shp')
    kwargs['asum_thresh'] = 20.
    kwargs['width_from_asum_a_param'] = 0.0592
    kwargs['width_from_asum_b_param'] = 0.5127
    kwargs['known_connections'] = {
        17955195: 17955197,
        17955197: 17955185,
        17954979: 17954993,
        17954993: 17955075
    }
    kwargs['logger'] = None
    kwargs['output_length_units'] = 'meters'
    kwargs['outfolder'] = outfolder
    kwargs['project_epsg'] = 5070
    preprocessed_flowlines = preprocess_nhdplus(**kwargs)

    # check that the known_connections were routed correctly
    for comid, tocomid in kwargs['known_connections'].items():
        assert preprocessed_flowlines.loc[comid, 'tocomid'] == tocomid
    out_shapefile = os.path.join(outfolder, 'preprocessed_flowlines.shp')
    df2shp(preprocessed_flowlines, out_shapefile, crs=5070)
    return preprocessed_flowlines
def write_shapefile(self, filename='grid.shp'):
    i, j = np.indices((self.nrow, self.ncol))
    df = pd.DataFrame({'node': list(range(len(self.polygons))),
                       'i': i.ravel(),
                       'j': j.ravel(),
                       'geometry': self.polygons
                       })
    df2shp(df, filename, epsg=self.epsg, proj_str=self.proj_str)
def write_bbox_shapefile(modelgrid, outshp):
    outline = get_grid_bounding_box(modelgrid)
    df2shp(pd.DataFrame({'desc': ['model bounding box'],
                         'geometry': [outline]}),
           outshp, epsg=modelgrid.epsg)
def extent_poly():
    extent_poly_ll = box(-92.7, 46.7, -92.6, 46.8)
    extent_poly = project(extent_poly_ll,
                          "+init=epsg:{}".format(4269),
                          "+init=epsg:26915")
    df = pd.DataFrame({'geometry': [extent_poly], 'id': [0]})
    df2shp(df, 'examples/data/bbox.shp', epsg=26915)
    return extent_poly_ll
def shapefile_features(polygon_features, test_output_path):
    df = pd.DataFrame({
        'id': list(range(len(polygon_features))),
        'geometry': polygon_features
    })
    shapefile_name = '{}/zstats_features.shp'.format(test_output_path)
    df2shp(df, shapefile_name, epsg=3070)
    return shapefile_name
def point_data(test_output_path):
    df = pd.DataFrame({
        'x': [1, 3, 5, 5, 3, 2],
        'y': [1, 1, 1, 3, 2, 4],
        'values': np.random.randn(6),
    })
    df['geometry'] = [Point(x, y) for x, y in zip(df.x, df.y)]
    df2shp(df, test_output_path / 'test_points.shp', crs=5070)
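# Minimal, self-contained sketch of the df2shp pattern used by these
# fixtures: build a DataFrame with a shapely 'geometry' column and write it
# with a coordinate reference system. Assumes gisutils is installed and
# accepts a `crs` keyword, as in the fixtures above; the file name is
# arbitrary.
import numpy as np
import pandas as pd
from shapely.geometry import Point
from gisutils import df2shp

demo = pd.DataFrame({'x': [0., 1.], 'y': [0., 1.],
                     'values': np.random.randn(2)})
demo['geometry'] = [Point(x, y) for x, y in zip(demo.x, demo.y)]
df2shp(demo, 'demo_points.shp', crs=5070)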
def shellmound_active_area(shellmound_grid, outdir):
    """Make a shapefile of the shellmound bounding box."""
    l, r, b, t = shellmound_grid.extent
    bbox = box(l, b, r, t)
    df = pd.DataFrame({'geometry': [bbox], 'id': [0]})
    out_shapefile = os.path.join(outdir, 'shellmound', 'shellmound_bbox.shp')
    gisutils.df2shp(df, out_shapefile, crs=5070)
    return out_shapefile
def write_active_area_shapefile(self, outshp='active_area.shp'):
    if self._active_area is None:
        self.create_active_area_polygon_from_isfr()
    assert isinstance(self._active_area, Polygon), \
        "active area didn't get set correctly (not a shapely Polygon)"
    df = pd.DataFrame({'geometry': [self._active_area],
                       'description': ['Active area where SFR will be applied.']})
    df2shp(df, outshp, crs=self.crs)
def write_shapefile(self, outshp='flowlines.shp'):
    """Write a shapefile of :py:attr:`Lines.df`.

    Parameters
    ----------
    outshp : str, optional
        Shapefile name, by default 'flowlines.shp'
    """
    df2shp(self.df, outshp, crs=self.crs)
def export_shapefile(filename, data, modelgrid, kper=None, squeeze=True,
                     epsg=None, proj_str=None, prj=None, verbose=False):
    t0 = time.time()
    if isinstance(data, MFTransientList) or isinstance(data, MfList):
        df = mftransientlist_to_dataframe(data, squeeze=squeeze)
    elif isinstance(data, np.recarray):
        df = pd.DataFrame(data)
    elif isinstance(data, pd.DataFrame):
        df = data
    else:
        raise TypeError("data needs to be a pandas DataFrame, MFList, "
                        "or numpy recarray")
    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str
    if 'cellid' in df.columns and isinstance(df['cellid'].values[0], tuple):
        k, i, j = list(zip(*df['cellid']))
        i = np.array(i)
        j = np.array(j)
    elif 'i' in df.columns and 'j' in df.columns:
        i, j = df['i'].values, df['j'].values
    elif 'geometry' not in df.columns:
        raise ValueError('DataFrame needs cellid, (i, j) or geometry '
                         'information to be exported to shapefile.')
    if kper is not None:
        df = df.loc[df.per == kper]
        verts = np.array(modelgrid.get_cell_vertices(i, j))
    elif df is not None:
        verts = modelgrid.get_vertices(i, j)
    # use cell geometries from the model grid
    if 'geometry' not in df.columns:
        polys = np.array([Polygon(v) for v in verts])
        df['geometry'] = polys
        # unfortunately, reaches through inactive cells
        # lose their cellid (k, i, j) location
        # so there is no way to plot these
        # without geometries from another source (such as the sfrlines)
        # drop such geometries, which are identified by k, i, j == -1
        invalid_geoms = np.any(df[['k', 'i', 'j']] < 0, axis=1)
        df = df.loc[~invalid_geoms].copy()
    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str
    if prj is None:
        prj = modelgrid.prj
    df2shp(df, filename, epsg=epsg, proj_str=proj_str, prj=prj)
    if verbose:
        print("shapefile export took {:.2f}s".format(time.time() - t0))
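# Hedged usage sketch for export_shapefile. `m` is a hypothetical flopy model
# with a wel package and an attached modelgrid; the stress period data carry
# (k, i, j) or cellid locations that get joined to grid cell polygons:
#
# export_shapefile('wel_per0.shp', m.wel.stress_period_data,
#                  m.modelgrid, kper=0, squeeze=True)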
def export_lines(self, filename=None):
    """Export shapefile of linework"""
    if filename is None:
        filename = '{}_{}_cells.shp'.format(self.package_name,
                                            self.package_type)
    if self.package_type == 'sfr':
        data = self.reach_data
    else:
        data = self.stress_period_data
    assert 'geometry' in data.columns and \
           isinstance(data.geometry.values[0], LineString), \
        "No LineStrings in reach_data.geometry"
    df2shp(data, filename, crs=self.grid.crs)
def export_shapefile(filename, data, modelgrid, kper=None, squeeze=True,
                     epsg=None, proj_str=None, prj=None, verbose=False):
    t0 = time.time()
    if isinstance(data, MFTransientList) or isinstance(data, MfList):
        df = mftransientlist_to_dataframe(data, squeeze=squeeze)
    elif isinstance(data, np.recarray):
        df = pd.DataFrame(data)
    elif isinstance(data, pd.DataFrame):
        df = data
    else:
        raise TypeError(
            "data needs to be a pandas DataFrame, MFList, or numpy recarray")
    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str
    if 'cellid' in df.columns and isinstance(df['cellid'].values[0], tuple):
        k, i, j = list(zip(*df['cellid']))
        i = np.array(i)
        j = np.array(j)
    elif 'i' in df.columns and 'j' in df.columns:
        i, j = df['i'].values, df['j'].values
    elif 'geometry' not in df.columns:
        raise ValueError('DataFrame needs cellid, (i, j) or geometry '
                         'information to be exported to shapefile.')
    if kper is not None:
        df = df.loc[df.per == kper]
        verts = np.array(modelgrid.get_cell_vertices(i, j))
    elif df is not None:
        verts = modelgrid.get_vertices(i, j)
    if 'geometry' not in df.columns:
        polys = np.array([Polygon(v) for v in verts])
        df['geometry'] = polys
    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str
    if prj is None:
        prj = modelgrid.prj
    df2shp(df, filename, epsg=epsg, proj_str=proj_str, prj=prj)
    if verbose:
        print("shapefile export took {:.2f}s".format(time.time() - t0))
def test_locate_sites(shellmound_sfrdata, outdir):
    X, Y, rno = zip(*((515459.9, 1189906.1, 202),
                      (515375.2, 1189942.5, 204)))
    df = pd.DataFrame({
        'geometry': [Point(x, y) for x, y in zip(X, Y)],
        'site_no': rno
    })
    sites_shapefile = '{}/sites.shp'.format(outdir)
    df2shp(df, sites_shapefile, epsg=5070)
    sfrlines_shapefile = '{}/shellmound_lines.shp'.format(outdir)
    shellmound_sfrdata.export_lines(sfrlines_shapefile)
    active_area = box(*shellmound_sfrdata.grid.bounds)
    locs = locate_sites(sites_shapefile, sfrlines_shapefile, active_area,
                        keep_columns=None,
                        reach_id_col='rno',
                        site_number_col='site_no',
                        perimeter_buffer=1000,
                        distance_threshold=1600)
    assert locs.rno.equals(locs.site_no)
def write_shp(self, df, shpname='NWIS_export.shp', **kwargs):
    """Write a shapefile of points from NWIS site file

    Parameters
    ----------
    df : dataframe
        dataframe of site info, must have dec_long_va and dec_lat_va
        columns with lon/lat in DD
    shpname : string
        Name for output shapefile

    Notes
    -----
    NAD83 is assumed for dec_long_va and dec_lat_va.
    If some entries are in NAD27, a difference of ~5 to >15m will result
    for WI (see http://en.wikipedia.org/wiki/North_American_Datum#/media/File:Datum_Shift_Between_NAD27_and_NAD83.png)
    """
    shpdf = df.copy()
    shpdf['geometry'] = [Point(r.dec_long_va, r.dec_lat_va)
                         for i, r in shpdf.iterrows()]
    gisutils.df2shp(shpdf, shpname, epsg=4269)
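# Runnable sketch of the same pattern with a small NWIS-like site table.
# The column names follow the NWIS site-file convention used above; the
# site numbers, coordinates, and output name are made up. NAD83 geographic
# coordinates -> epsg:4269.
import pandas as pd
from shapely.geometry import Point
from gisutils import df2shp

sites = pd.DataFrame({'site_no': ['040871488', '040871473'],
                      'dec_long_va': [-87.9, -88.0],
                      'dec_lat_va': [43.0, 43.1]})
sites['geometry'] = [Point(lon, lat) for lon, lat in
                     zip(sites.dec_long_va, sites.dec_lat_va)]
df2shp(sites, 'nwis_sites.shp', epsg=4269)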
def assign_layers_from_screen_top_botm(data, model,
                                       flux_col='q',
                                       screen_top_col='screen_top',
                                       screen_botm_col='screen_botm',
                                       label_col='site_no',
                                       across_layers=False,
                                       distribute_by='thickness',
                                       minimum_layer_thickness=2.):
    """Assign model layers to pumping flux data based on open interval.
    Fluxes are applied to each layer proportional to the fraction of
    open interval in that layer.

    Parameters
    ----------
    data : dataframe of well info
        Must have i, j or x, y locations
    model : mfsetup.MF6model or mfsetup.MFnwtModel instance
        Must have dis, and optionally, attached MFsetupGrid instance
    flux_col : column in data with well fluxes
    screen_top_col : column in data with screen top elevations
    screen_botm_col : column in data with screen bottom elevations
    label_col : column with well names (optional; default site_no)
    across_layers : bool
        True to distribute fluxes to multiple layers intersected by
        open interval
    distribute_by : str ('thickness' or 'transmissivity')
        Distribute fluxes to layers based on thickness or transmissivity
        of intersected open intervals.

    Returns
    -------
    data : dataframe of well info, modified so that each row represents
        pumping in a single model layer (with fluxes modified proportional
        to the amount of open interval in that layer).
    """
    # inactive cells in either MODFLOW version
    if model.version == 'mf6':
        idomain = model.idomain
    else:
        idomain = model.bas6.ibound.array
    # 'boundname' column is used by wel setup for identifying wells
    if label_col in data.columns:
        data['boundname'] = data[label_col]
    if across_layers:
        raise NotImplementedError('Distributing fluxes to multiple layers')
    else:
        if distribute_by == 'thickness':
            i, j, x, y, screen_botm, screen_top = None, None, None, None, None, None
            if 'i' in data.columns and 'j' in data.columns:
                i, j = data['i'].values, data['j'].values
            elif 'x' in data.columns and 'y' in data.columns:
                x, y = data['x'].values, data['y'].values
            if screen_top_col in data.columns:
                screen_top = data[screen_top_col].values
            if screen_botm_col in data.columns:
                screen_botm = data[screen_botm_col].values
            thicknesses = get_open_interval_thickness(model,
                                                      i=i, j=j, x=x, y=y,
                                                      screen_top=screen_top,
                                                      screen_botm=screen_botm)
            # for each i, j location with a well,
            # get the layer with highest thickness in the open interval
            data['k'] = np.argmax(thicknesses, axis=0)
            # get the thickness for those layers
            all_layers = np.zeros((model.nlay + 1, model.nrow, model.ncol))
            all_layers[0] = model.dis.top.array
            all_layers[1:] = model.dis.botm.array
            layer_thicknesses = -np.diff(all_layers[:, i, j], axis=0)
            # only include thicknesses for valid layers
            # set inactive cells to 0 thickness for the purpose of relocating wells
            layer_thicknesses[idomain[:, i, j] != 1] = 0
            data['idomain'] = idomain[data['k'], i, j]
            data['laythick'] = layer_thicknesses[
                data['k'].values, list(range(layer_thicknesses.shape[1]))]
            # flag layers that are too thin or inactive
            inactive = idomain[data.k, data.i, data.j] != 1
            invalid_open_interval = (data['laythick'] < minimum_layer_thickness) | inactive
            if any(invalid_open_interval):
                outfile = model.cfg['wel']['output_files'][
                    'dropped_wells_file'].format(model.name)
                # move wells that are still in a thin layer
                # to the thickest active layer
                data['orig_layer'] = data['k']
                thickest_layer = np.argmax(layer_thicknesses, axis=0)
                data.loc[invalid_open_interval, 'k'] = thickest_layer[invalid_open_interval]
                data['laythick'] = layer_thicknesses[
                    data['k'].values, list(range(layer_thicknesses.shape[1]))]
                data['idomain'] = idomain[data['k'], i, j]
                # record which wells were moved or dropped, and why
                bad_wells = data.loc[invalid_open_interval].copy()
                bad_wells['category'] = 'moved'
                bad_wells['reason'] = 'longest open interval thickness < {} {} minimum'.format(
                    minimum_layer_thickness, model.length_units)
                bad_wells['routine'] = __name__ + '.assign_layers_from_screen_top_botm'
                msg = ('Warning: {} of {} wells in layers less than '
                       'specified minimum thickness of {} {}\n'
                       'were moved to the thickest layer at their i, j locations.\n'
                       .format(invalid_open_interval.sum(), len(data),
                               minimum_layer_thickness, model.length_units))
                still_below_minimum = bad_wells['laythick'] < minimum_layer_thickness
                bad_wells.loc[still_below_minimum, 'category'] = 'dropped'
                bad_wells.loc[still_below_minimum, 'reason'] = \
                    'no layer above minimum thickness of {} {}'.format(
                        minimum_layer_thickness, model.length_units)
                n_below = np.sum(still_below_minimum)
                if n_below > 0:
                    msg += ('Out of these, {} of {} total wells remaining in layers less than '
                            'specified minimum thickness of {} {}'
                            ''.format(n_below, len(data),
                                      minimum_layer_thickness, model.length_units))
                    if flux_col in data.columns:
                        pct_flux_below = 100 * bad_wells.loc[
                            still_below_minimum, flux_col].sum() / data[flux_col].sum()
                        msg += ', \nrepresenting {:.2f} % of total flux,'.format(
                            pct_flux_below)
                    msg += '\nwere dropped. See {} for details.'.format(outfile)
                print(msg)
                # write shapefile and CSV output for wells that were dropped
                cols = ['k', 'i', 'j', 'boundname', 'category', 'laythick',
                        'idomain', 'reason', 'routine', 'x', 'y']
                if flux_col in data.columns:
                    cols.insert(3, flux_col)
                flux_below = bad_wells.groupby(['k', 'i', 'j']).first().reset_index()[cols]
                append_csv(outfile, flux_below, index=False, float_format='%g')
                if 'x' in flux_below.columns and 'y' in flux_below.columns:
                    flux_below['geometry'] = [Point(xi, yi) for xi, yi in
                                              zip(flux_below.x, flux_below.y)]
                    df2shp(flux_below, outfile[:-4] + '.shp',
                           epsg=model.modelgrid.epsg)
                # cull the wells that are still below the min. layer thickness
                data = data.loc[data['laythick'] > minimum_layer_thickness].copy()
        elif distribute_by == 'transmissivity':
            raise NotImplementedError(
                'Distributing well fluxes by layer transmissivity')
        else:
            raise ValueError('Unrecognized argument for distribute_by: {}'.format(
                distribute_by))
    return data
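# Toy illustration of the layer-assignment step above: given per-layer
# open-interval thicknesses at each well (an nlay x nwells array), np.argmax
# along axis 0 picks the layer with the most open interval. Values are
# made up.
import numpy as np

thicknesses = np.array([[0., 2., 5.],    # layer 0 at 3 wells
                        [4., 2., 1.],    # layer 1
                        [1., 6., 0.]])   # layer 2
k = np.argmax(thicknesses, axis=0)
assert k.tolist() == [1, 2, 0]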
def write_grid_shapefile(self, outshp='grid.shp'):
    df2shp(self.df, outshp, crs=self.crs)
def assign_monthly_production(self, outfile='processed_swuds.csv'):
    """Assign production wells for water use, skipping IR (irrigation),
    AQ, and TE (thermal electric) sites, to production zones. If
    production zones are not assigned, or if the well bottom doesn't fall
    into a production zone, then the screen_top and screen_bot are
    assigned using well_depth and the default screen length.

    Production is given in cubic m per day.

    todo: add unit conversion parameter so other units can be used?

    Parameters
    ----------
    outfile : str
        path to final processed monthly water-use file with production
        zone information
    """
    # fill in missing monthly values with annual value
    for c in self.monthly_cols:
        idx = self.df.loc[self.df[c].isnull()].index.values
        self.df.loc[idx, c] = self.df.loc[idx, 'ANNUAL_VAL']

    # pull out groundwater sites that are not IR, AQ or TE
    self.df = self.df.loc[(self.df['WATER_CD'] == 'GW') &
                          ~(self.df['FROM_NAT_WATER_USE_CD'] == 'IR') &
                          ~(self.df['FROM_NAT_WATER_USE_CD'] == 'AQ') &
                          ~(self.df['FROM_NAT_WATER_USE_CD'] == 'TE')]

    # reshape dataframe to have monthly values in same column
    stacked = pd.DataFrame(self.df[self.monthly_cols].stack())
    stacked.reset_index(inplace=True)
    stacked.rename(columns={'level_1': 'month',
                            0: 'q_monthly'}, inplace=True)
    stacked.index = stacked.level_0
    stacked = stacked.join(self.df)
    keep_cols = [c for c in stacked.columns if c not in self.monthly_cols]
    stacked = stacked[keep_cols]
    month = {name: i + 1 for i, name in enumerate(self.monthly_cols)}
    dates = ['{}-{:02d}'.format(year, month[month_column_name])
             for year, month_column_name in zip(stacked.YEAR, stacked.month)]
    stacked['datetime'] = pd.to_datetime(dates)
    stacked.sort_values(by=['SITE_NO', 'datetime'], inplace=True)

    # set start and end dates if not already set
    if self.start_date is None:
        self.start_date = stacked.datetime.min()
    if self.end_date is None:
        self.end_date = stacked.datetime.max()
    groups = stacked.groupby('SITE_NO')
    all_groups = []
    for site_no, group in groups:
        group = group.copy()
        group.index = pd.to_datetime(group['datetime'])
        start_date = pd.Timestamp(self.start_date)
        end_date = pd.Timestamp(self.end_date)

        monthly_values_2010 = group.loc[group.datetime.dt.year == 2010]
        monthly_values_2010 = dict(zip(monthly_values_2010.datetime.dt.month,
                                       monthly_values_2010.q_monthly))
        avg_monthly_values = group.groupby(group.index.month).mean().q_monthly.to_dict()
        q_mean = group.q_monthly.mean()

        # reindex the site data to include all months for simulation period
        all_dates = pd.date_range(start_date, end_date, freq='MS')
        group = group.reindex(all_dates)

        # fill empty dates
        q = []
        for month, q_monthly in zip(group.index.month, group.q_monthly):
            # try to use 2010 values if they exist
            if np.isnan(q_monthly):
                q_monthly = monthly_values_2010.get(month, np.nan)
            # otherwise take the average value for each month
            if np.isnan(q_monthly):
                q_monthly = avg_monthly_values[month]
            # fill missing months with the mean value for the site
            if np.isnan(q_monthly):
                q_monthly = q_mean
            q.append(q_monthly)
        # assume most values represent abstraction
        # if sum is positive, invert so that output values are negative
        if np.sum(q) > 0:
            q = -np.array(q)
        group['q'] = q
        #group['q'] = group['q'] * 3785.4  # convert from mgd to cubic m per d
        group['q'] = group['q'] * convert_volume_units(self.data_volume_units,
                                                       self.model_length_units)
        group['site_no'] = f'swuds_{site_no}'
        group['well_elev'] = self.well_elevations[site_no]
        group['depth'] = self.depths[site_no]
        well_botm_depth = self.well_elevations[site_no] - self.depths[site_no]
        group['x'] = np.nanmin(group['x'])
        group['y'] = np.nanmin(group['y'])

        # assign a production zone from default dict. If the bottom of the
        # well does not fall in a zone, or if the dictionary is empty, then
        # the production zone is assigned 'unnamed'
        production_zone = 'unnamed'
        for prod_name in self.prod_zone_top.keys():
            prod_zone_top = self.prod_zone_top[prod_name][site_no]
            prod_zone_bot = self.prod_zone_bot[prod_name][site_no]
            if np.isnan(prod_zone_top) or np.isnan(prod_zone_bot):  # missing zone
                group['screen_bot'] = self.well_elevations[site_no] - self.depths[site_no]
                group['screen_top'] = self.well_elevations[site_no] - \
                    self.depths[site_no] + self.default_screen_len
                group['open_int_method'] = 'well depth'
            else:
                if well_botm_depth < prod_zone_top and well_botm_depth > prod_zone_bot:
                    production_zone = prod_name
                    group['screen_bot'] = prod_zone_bot
                    group['screen_top'] = prod_zone_top
                    group['open_int_method'] = 'production zone'
                else:
                    group['screen_bot'] = self.well_elevations[site_no] - self.depths[site_no]
                    group['screen_top'] = self.well_elevations[site_no] - \
                        self.depths[site_no] + self.default_screen_len
                    group['open_int_method'] = 'well depth'
        group['production_zone'] = production_zone

        # add aquifer name
        group['aquifer_name'] = self.aquifer_names.get(
            group["FROM_AQFR_CD"].values[0], 'unnamed')
        cols = ['site_no', 'q', 'q_monthly', 'month', 'well_elev', 'depth',
                'screen_bot', 'screen_top', 'x', 'y']
        all_groups.append(group[cols])

    self.df = pd.concat(all_groups)
    self.df['start_datetime'] = self.df.index  # start date of each pumping period
    if outfile is not None:
        outfile = Path(outfile)
        self.df.to_csv(outfile, index=False)
        print('processed SWUDS data written to {0} and in dataframe attribute'
              .format(outfile))
        self.df['geometry'] = [Point(x, y) for x, y in zip(self.df.x, self.df.y)]
        # write only unique pumping values to shapefile
        to_shapefile = self.df.groupby(['site_no', 'q']).first().reset_index()
        shapefile = outfile.with_suffix('.shp')
        df2shp(to_shapefile, shapefile, crs=self.dest_crs)
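# Sketch of the monthly reindexing pattern used above: a sparse time series
# is reindexed to a continuous month-start ('MS') index over the simulation
# period, leaving NaNs where values still need to be filled. Dates and
# rates are made up.
import pandas as pd

ts = pd.Series([100., 120.],
               index=pd.to_datetime(['2010-01-01', '2010-04-01']))
all_dates = pd.date_range('2010-01-01', '2010-06-01', freq='MS')
ts = ts.reindex(all_dates)  # Feb, Mar, May, Jun are NaN, to be filled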
def write_grid_shapefile(self, outshp='grid.shp'):
    df2shp(self.df, outshp, epsg=self.crs.epsg, prj=self.crs.prjfile)
def preprocess_headobs(data, metadata,
                       head_data_columns=['head', 'last_head', 'head_std'],
                       dem=None, dem_units='meters',
                       start_date='1998-04-01',
                       active_area=None,
                       active_area_id_column=None,
                       active_area_feature_id=None,
                       source_crs=4269, dest_crs=5070,
                       data_length_units='meters',
                       model_length_units='meters',
                       geographic_groups=None,
                       geographic_groups_col=None,
                       max_obsname_len=None,
                       outfile='../source_data/observations/head_obs/preprocessed_head_obs.csv'):
    """Preprocess head observation data, for example, groundwater level data
    output from the `visGWDB program <https://doi.org/10.5066/P9W004O6>`_.

    * Data are reprojected from a `source_crs` (Coordinate reference system;
      assumed to be in geographic coordinates) to the CRS of the model
      (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of
      polygons defining the model area
    * length units are converted to those of the groundwater model. Open
      intervals for the wells are converted from depths to elevations
    * missing open intervals are filled based on well bottom depths
      (if available) and the median open interval length for the dataset.
    * Wells are categorized based on the quality of the open interval
      information (see the documentation for
      :func:`mapgwm.headobs.fill_well_open_intervals`).
    * Prefixes for observation names (with an optional length limit) that
      identify the location are generated
    * Preliminary observation groups can also be assigned, based on
      geographic areas defined by polygons (`geographic_groups` parameter)

    Parameters
    ----------
    data : DataFrame
        Head observation data, e.g. as output from
        :func:`mapgwm.headobs.get_data`.

        Columns:

        ========= ================================================================
        site_no   site identifier
        lat       latitude
        lon       longitude
        datetime  measurement dates in pandas datetime format
        head      average head for the period represented by the datetime
        last_head last head measurement for the period represented by the datetime
        head_std  standard deviation of measured heads within the datetime period
        ========= ================================================================

        Notes:

        * lat and lon columns can alternatively be in the metadata table
        * `last_head` and `head_std` only need to be included if they are
          in `head_data_columns`

    metadata : DataFrame
        Head observation metadata, e.g. as output from
        :func:`mapgwm.headobs.get_data`. Must have the following columns:

        ================= ==========================================================
        site_no (index)   site identifier
        aqfr_cd           Local aquifer code
        screen_botm       Well screen bottom, as a depth below land surface, in feet
        screen_top        Well screen top, as a depth below land surface, in feet
        well_depth        Well depth, in feet
        well_el           Altitude of land surface, in feet
        ================= ==========================================================

    head_data_columns : list of strings
        Columns in data with head values or their statistics.
        By default, 'head', 'last_head', 'head_std', which allows both
        the average and last head values for the stress period to be
        considered, as well as the variability of water levels
        contributing to an average value.
    dem : str, optional
        DEM raster of the land surface. Used for estimating missing wellhead
        elevations. Any reprojection to dest_crs is handled automatically,
        assuming the DEM raster has CRS information embedded
        (arc-ascii grids do not!). By default, None.
    dem_units : str, {'feet', 'meters', ..}
        Units of DEM elevations, by default, 'meters'
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically
        reprojected to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    data_length_units : str; 'meters', 'feet', etc.
        Length units of head observations.
    model_length_units : str; 'meters', 'feet', etc.
        Length units of model.
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located
        within the area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.
    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.
    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    df : DataFrame
        Preprocessed time series
    well_info : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`
    """
    df = data.copy()
    # multiplier to convert input length units to model units
    unit_conversion = convert_length_units(data_length_units,
                                           model_length_units)

    # outputs
    out_plot = None
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_plot = os.path.join(outpath, 'open_interval_lengths.pdf')
        out_shapefile = outname + '_info.shp'

    # set the starting and ending dates here
    stdate = pd.Timestamp(start_date)

    # convert to datetime; drop the timestamps
    df['datetime'] = pd.to_datetime(df.datetime).dt.normalize()

    # trim to the time range
    n_measurements = len(data)
    n_sites = len(set(data.site_no))
    print(f'starting with {n_measurements:,d} measurements at '
          f'{n_sites:,d} unique wells')
    no_data_in_period = df.datetime < stdate
    if np.any(no_data_in_period):
        in_period = df.datetime >= stdate
        n_sites_before = len(set(df.loc[no_data_in_period, 'site_no'])
                             .difference(set(df.loc[in_period, 'site_no'])))
        print(f'culling {no_data_in_period.sum():,d} measurements from '
              f'{n_sites_before:,d} sites that are prior to start date '
              f'of {start_date}')
        df = df.loc[in_period]

    # collapse dataset to mean values at each site
    groups = df.groupby('site_no')
    well_info = groups.mean().copy()
    well_info = well_info.join(metadata, rsuffix='_meta')
    well_info['start_dt'] = groups.datetime.min()
    well_info['end_dt'] = groups.datetime.max()
    well_info.drop(labels=['year', 'month'], axis=1, inplace=True)
    well_info['site_no'] = well_info.index
    well_info['n'] = groups.datetime.count()

    # project x, y to model crs
    x_pr, y_pr = project((well_info.lon.values, well_info.lat.values),
                         source_crs, dest_crs)
    well_info.drop(['lon', 'lat'], axis=1, inplace=True)
    well_info['x'], well_info['y'] = x_pr, y_pr
    well_info['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, md = cull_data_to_active_area(df, active_area,
                                          active_area_id_column,
                                          active_area_feature_id,
                                          data_crs=dest_crs,
                                          metadata=well_info)

    # convert length units; convert screen tops and botms to depths
    missing_elevations = well_info.well_el.isna()
    if dem is not None and np.any(missing_elevations):
        well_location_elevations = get_values_at_points(dem,
                                                        well_info['x'],
                                                        well_info['y'],
                                                        points_crs=dest_crs)
        well_location_elevations *= convert_length_units(dem_units,
                                                         model_length_units)
        well_info.loc[missing_elevations, 'well_el'] = \
            well_location_elevations[missing_elevations]

    length_columns = ['well_el'] + head_data_columns + ['screen_top',
                                                        'screen_botm']
    for col in length_columns:
        if col in well_info.columns:
            well_info[col] *= unit_conversion

    well_info['well_botm'] = well_info['well_el'] - well_info['well_depth']
    well_info['screen_top'] = well_info['well_el'] - well_info['screen_top']
    well_info['screen_botm'] = well_info['well_el'] - well_info['screen_botm']

    # just the data, site numbers, times and aquifer
    head_data_columns = head_data_columns + ['head_std']
    transient_cols = ['site_no', 'datetime'] + head_data_columns + ['n']
    transient_cols = [c for c in transient_cols if c in df.columns]
    df = df[transient_cols].copy()
    for c in head_data_columns:
        if c in df.columns:
            df[c] *= unit_conversion

    # trim down to only well_info with both estimated water levels
    # and standard deviation
    # monthly measured levels may not have standard deviation
    # (as opposed to monthly statistical estimates)
    criteria = pd.notnull(well_info['head'])
    #if 'head_std' in df.columns:
    #    criteria = criteria & pd.notnull(well_info['head_std'])
    well_info = well_info[criteria]

    # verify that all well_info have a wellhead elevation
    assert not np.any(np.isnan(well_info.well_el))

    # categorize wells based on quality of open interval information
    # estimate missing open intervals where possible
    well_info = fill_well_open_intervals(well_info, out_plot=out_plot)

    # drop well_info with negative reported open interval
    #well_info = well_info.loc[open_interval_length > 0]

    # cull data to well_info in well info table
    has_metadata = df.site_no.isin(well_info.index)
    if np.any(~has_metadata):
        warnings.warn('culling {} wells not found in metadata table!'
                      .format(np.sum(~has_metadata)))
        df = df.loc[has_metadata].copy()

    # make unique n-character prefixes (site identifiers)
    # for each observation location
    # 13 character length allows for prefix_yyyymmm in 20 character
    # observation names (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in well_info.index.tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn, unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    well_info['obsprefix'] = obsnames
    obsprefix = dict(zip(well_info.index, well_info.obsprefix))
    df['obsprefix'] = [obsprefix[sn] for sn in df.site_no]

    # add area of interest information
    well_info['group'] = 'heads'
    well_info = assign_geographic_obsgroups(well_info, geographic_groups,
                                            geographic_groups_col,
                                            metadata_crs=dest_crs)

    # save out the results
    if outfile is not None:
        df2shp(well_info.drop(['x', 'y'], axis=1), out_shapefile,
               index=False, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        well_info.drop('geometry', axis=1).to_csv(out_info_csvfile,
                                                  index=False,
                                                  float_format='%.2f')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%.2f')
    return df, well_info
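# Hedged usage sketch for preprocess_headobs; the file paths are
# hypothetical. `data` and `metadata` would typically come from
# mapgwm.headobs.get_data:
#
# df, well_info = preprocess_headobs(
#     data, metadata,
#     dem='dem_30m.tif', dem_units='meters',
#     start_date='1998-04-01',
#     source_crs=4269, dest_crs=5070,
#     data_length_units='feet', model_length_units='meters',
#     outfile='preprocessed_head_obs.csv')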
def write_shapefile(self, outshp='flowlines.shp'):
    df2shp(self.df, outshp, epsg=self.crs.epsg, prj=self.crs.prjfile)
def export_array_contours(filename, a, modelgrid,
                          fieldname='level',
                          interval=None,
                          levels=None,
                          maxlevels=1000,
                          epsg=None,
                          proj_str=None,
                          verbose=False,
                          **kwargs):
    """Contour an array using matplotlib; write shapefile of contours.

    Parameters
    ----------
    filename : str
        Path of output file with '.shp' extension.
    a : 2D numpy array
        Array to contour
    epsg : int
        EPSG code. See https://www.epsg-registry.org/ or spatialreference.org
    prj : str
        Existing projection file to be used with new shapefile.
    **kwargs : keyword arguments to matplotlib.axes.Axes.contour
    """
    t0 = time.time()
    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str
    if interval is not None:
        kwargs['levels'] = make_levels(a, interval, maxlevels)
    elif levels is not None:
        kwargs['levels'] = levels
    ax = plt.subplots()[-1]
    contours = ax.contour(modelgrid.xcellcenters, modelgrid.ycellcenters,
                          a, **kwargs)
    plt.close()
    if not isinstance(contours, list):
        contours = [contours]
    if epsg is None:
        epsg = modelgrid.epsg
    if proj_str is None:
        proj_str = modelgrid.proj_str
    geoms = []
    level = []
    for ctr in contours:
        levels = ctr.levels
        for i, c in enumerate(ctr.collections):
            paths = c.get_paths()
            geoms += [LineString(p.vertices) if len(p) > 1 else LineString()
                      for p in paths]
            level += list(np.ones(len(paths)) * levels[i])
    # assemble the levels and geometries into a DataFrame and write out
    df = pd.DataFrame({'level': level, 'geometry': geoms})
    df2shp(df, filename, epsg=epsg, proj_str=proj_str)
    if verbose:
        print("array contour export took {:.2f}s".format(time.time() - t0))
    return
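# Self-contained sketch of the contour -> LineString extraction pattern
# above, using the version-stable `allsegs` attribute of the ContourSet
# instead of `collections` (which is deprecated in matplotlib >= 3.8).
# The grid and values are made up.
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import LineString

x, y = np.meshgrid(np.arange(10), np.arange(10))
z = np.hypot(x - 5, y - 5)
fig, ax = plt.subplots()
ctr = ax.contour(x, y, z, levels=[2, 4])
plt.close(fig)
geoms, level = [], []
for lev, segs in zip(ctr.levels, ctr.allsegs):
    for s in segs:
        if len(s) > 1:  # need at least 2 vertices for a LineString
            geoms.append(LineString(s))
            level.append(lev)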
def preprocess_te_wateruse(data, start_date=None, end_date=None,
                           active_area=None,
                           active_area_id_column=None,
                           active_area_feature_id=None,
                           estimated_production_zone_top=None,
                           estimated_production_zone_botm=None,
                           estimated_production_surface_units='feet',
                           source_crs=4269, dest_crs=5070,
                           interp_method='linear',
                           data_volume_units='mgal',
                           model_length_units='meters',
                           outfile=None):
    """Preprocess water use data from thermoelectric power plants:

    * reproject data to a destination CRS (`dest_crs`)
    * cull data to an area of interest (`active_area`)
    * if input data do not have information on the well screen intervals,
      sample screen tops and bottoms from raster surfaces bounding
      an estimated production zone (e.g. `estimated_production_zone_top`)
    * reindex the data to continuous monthly values extending from
      `start_date` to `end_date`. Typically, these would bracket the time
      period for which the pumping should be simulated in a model. For
      example, the earliest data may be from 2010, but if the model starts
      in 2008, it may be appropriate to begin using the 2010 rates then
      (``start_date='2008'``). If no start or end date are given, the first
      and last years of pumping in `data` are used.
    * fill empty months by interpolation via a specified `interp_method`
    * backfill any remaining empty months going back to the `start_date`
    * write processed data to a CSV file and shapefile of the same name

    Parameters
    ----------
    data : DataFrame
        Thermoelectric water use data in the following format
        (similar to that output by
        :func:`mapgwm.te_wateruse.read_te_water_use_spreadsheet`):

        =============== =====================================================
        site_no         power plant identifier (plant code)
        start_datetime  pandas datetime representative of flux (e.g. '2010')
        x               x-coordinate of withdrawal, in `source_crs`
        y               y-coordinate of withdrawal, in `source_crs`
        q               withdrawal flux, in `data_volume_units` per day
        =============== =====================================================

    start_date : str
        Start date for pumping rates. If earlier than the dates in `data`,
        pumping rates will be backfilled to this date.
    end_date : str
        End date for pumping rates. If later than the dates in `data`,
        pumping rates will be forward filled to this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically
        reprojected to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    interp_method : str
        Interpolation method to use for filling pumping rates to monthly
        values. By default, 'linear'
    data_volume_units : str; e.g. 'mgal', 'm3', 'cubic feet', etc.
        Volume units of pumping data. All time units are assumed to be
        in days.
    model_length_units : str; e.g. 'feet', 'm', 'meters', etc.
        Length units of model.
    outfile : str
        Path for output file. A shapefile of the same name is also written.
        If None, no output file is written. By default, None

    Returns
    -------
    df_monthly : DataFrame

    Notes
    -----
    * time units for TE data and model are assumed to be days
    """
    df = data.copy()

    # reproject to dest_crs
    x, y = project(zip(df['x'], df['y']), source_crs, dest_crs)
    df['x'], df['y'] = x, y
    df['geometry'] = [Point(x, y) for x, y in zip(x, y)]

    # drop wells with no location information (for now)
    df.dropna(subset=['x', 'y'], axis=0, inplace=True)

    # cull data to that within the model area
    if active_area is not None:
        df = cull_data_to_active_area(df, active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs)

    # get top and bottom of estimated production interval at each well
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(
            estimated_production_surface_units, model_length_units)
        x, y = df.x.values, df.y.values
        est_screen_top = get_values_at_points(estimated_production_zone_top,
                                              x, y, points_crs=dest_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm,
                                               x, y, points_crs=dest_crs)
        est_screen_botm *= surf_unit_conversion
        df['screen_top'] = est_screen_top
        df['screen_botm'] = est_screen_botm

    # distribute fluxes to monthly values
    # set start and end dates if not already set
    if start_date is None:
        start_date = df.start_datetime.min()
    if end_date is None:
        end_date = df.start_datetime.max()
    groups = df.groupby('site_no')
    all_groups = []
    for site_no, group in groups:
        dfg = group.copy()

        # create a continuous monthly time index
        # labeled at the month start
        all_dates = pd.date_range(start_date, end_date, freq='MS')
        dfg.index = dfg['start_datetime']
        dfg = dfg.reindex(all_dates)

        # interpolate the discharge values;
        # back filling to the start date
        dfg['q'] = dfg.q.interpolate(method=interp_method).bfill()
        dfg['q'] *= convert_volume_units(data_volume_units,
                                         model_length_units)

        # fill remaining columns
        dfg['start_datetime'] = dfg.index
        fill_columns = set(dfg.columns).difference({'q', 'start_datetime'})
        fill_values = group.iloc[0].to_dict()
        for c in fill_columns:
            dfg[c] = fill_values[c]

        # add 'te' prefix to site number
        dfg['site_no'] = f'te_{site_no}'
        all_groups.append(dfg)
    df_monthly = pd.concat(all_groups)

    # assume most values represent abstraction
    # if sum is positive, invert so that output values are negative
    if df_monthly['q'].sum() > 0:
        df_monthly['q'] *= -1

    # clean up the columns
    cols = ['site_no', 'start_datetime', 'x', 'y',
            'screen_top', 'screen_botm', 'q', 'geometry']
    cols += list(set(df_monthly.columns).difference(cols))
    df_monthly = df_monthly[cols]

    # write the output
    if outfile is not None:
        outfile = Path(outfile)
        df_monthly.drop('geometry', axis=1).to_csv(outfile, index=False,
                                                   float_format='%g')
        print('wrote {}'.format(outfile))

        # write only unique pumping values to shapefile
        to_shapefile = df_monthly.groupby(['site_no', 'q']).first().reset_index()
        shapefile = outfile.with_suffix('.shp')
        df2shp(to_shapefile, shapefile, crs=dest_crs)
    return df_monthly
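# Sketch of the interpolate-and-backfill pattern used above: interior gaps
# are filled by interpolation, and months before the first reported value
# are backfilled with that value. Dates and rates are made up.
import pandas as pd

q = pd.Series([float('nan'), 10., float('nan'), 14.],
              index=pd.date_range('2010-01-01', periods=4, freq='MS'))
filled = q.interpolate(method='linear').bfill()
# -> Jan backfilled to 10; Mar interpolated to 12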
def preprocess_flows( data, metadata=None, flow_data_columns=['flow'], start_date=None, active_area=None, active_area_id_column=None, active_area_feature_id=None, source_crs=4269, dest_crs=5070, datetime_col='datetime', site_no_col='site_no', line_id_col='line_id', x_coord_col='x', y_coord_col='y', name_col='name', flow_qualifier_column=None, default_qualifier='measured', include_sites=None, include_line_ids=None, source_volume_units='ft3', source_time_units='s', dest_volume_units='m3', dest_time_units='d', geographic_groups=None, geographic_groups_col=None, max_obsname_len=None, add_leading_zeros_to_sw_site_nos=False, column_renames=None, outfile=None, ): """Preprocess stream flow observation data, for example, from NWIS or another data source that outputs time series in CSV format with site locations and identifiers. * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates) to the CRS of the model (`dest_crs`) * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area * length and time units are converted to those of the groundwater model. * Prefixes for observation names (with an optional length limit) that identify the location are generated * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons (`geographic_groups` parameter) Parameters ---------- data : csv file or DataFrame Time series of stream flow observations. Columns: ===================== ====================================== site_no site identifier datetime measurement dates/times x x-coordinate of site y y-coordinate of site flow_data_columns Columns of observed streamflow values flow_qualifier_column Optional column with qualifiers for flow values ===================== ====================================== Notes: * x and y columns can alternatively be in the metadata table * flow_data_columns are denoted in `flow_data_columns`; multiple columns can be included to process base flow and total flow, or other statistics in tandem * For example, `flow_qualifier_column` may have "estimated" or "measured" flags denoting whether streamflows were derived from measured values or statistical estimates. metadata : csv file or DataFrame Stream flow observation site information. May include columns: ================= ================================================================================ site_no site identifier x x-coordinate of site y y-coordinate of site name name of site line_id_col Identifier for a line in a hydrography dataset that the site is associated with. ================= ================================================================================ Notes: * other columns in metadata will be passed through to the metadata output flow_data_columns : list of strings Columns in data with flow values or their statistics. By default, ['q_cfs'] start_date : str (YYYY-mm-dd) Simulation start date (cull observations before this date) active_area : str Shapefile with polygon to cull observations to. Automatically reprojected to dest_crs if the shapefile includes a .prj file. by default, None. active_area_id_column : str, optional Column in active_area with feature ids. By default, None, in which case all features are used. active_area_feature_id : str, optional ID of feature to use for active area By default, None, in which case all features are used. source_crs : obj Coordinate reference system of the head observation locations. 
A Python int, dict, str, or :class:`pyproj.crs.CRS` instance passed to :meth:`pyproj.crs.CRS.from_user_input` Can be any of: - PROJ string - Dictionary of PROJ parameters - PROJ keyword arguments for parameters - JSON string with PROJ parameters - CRS WKT string - An authority string [i.e. 'epsg:4326'] - An EPSG integer code [i.e. 4326] - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')] - An object with a `to_wkt` method. - A :class:`pyproj.crs.CRS` class By default, epsg:4269 dest_crs : obj Coordinate reference system of the model. Same input types as ``source_crs``. By default, epsg:5070 datetime_col : str, optional Column name in data with observation date/times, by default 'datetime' site_no_col : str, optional Column name in data and metadata with site identifiers, by default 'site_no' line_id_col : str, optional Column name in data or metadata with identifiers for hydrography lines associated with observation sites. by default 'line_id' x_coord_col : str, optional Column name in data or metadata with x-coordinates, by default 'x' y_coord_col : str, optional Column name in data or metadata with y-coordinates, by default 'y' name_col : str, optional Column name in data or metadata with observation site names, by default 'name' flow_qualifier_column : str, optional Column name in data with flow observation qualifiers, such as "measured" or "estimated" by default 'category' default_qualifier : str, optional Default qualifier to populate flow_qualifier_column if it is None. By default, "measured" include_sites : list-like, optional Exclude output to these sites. by default, None (include all sites) include_line_ids : list-like, optional Exclude output to these sites, represented by line identifiers. by default, None (include all sites) source_volume_units : str, 'm3', 'cubic meters', 'ft3', etc. Volume units of the source data. By default, 'ft3' source_time_units : str, 's', 'seconds', 'days', etc. Time units of the source data. By default, 's' dest_volume_units : str, 'm3', 'cubic meters', 'ft3', etc. Volume units of the output (model). By default, 'm3' dest_time_units : str, 's', 'seconds', 'days', etc. Time units of the output (model). By default, 'd' geographic_groups : file, dict or list-like Option to group observations by area(s) of interest. Can be a shapefile, list of shapefiles, or dictionary of shapely polygons. A 'group' column will be created in the metadata, and observation sites within each polygon will be assigned the group name associated with that polygon. For example:: geographic_groups='../source_data/extents/CompositeHydrographArea.shp' geographic_groups=['../source_data/extents/CompositeHydrographArea.shp'] geographic_groups={'cha': <shapely Polygon>} Where 'cha' is an observation group name for observations located within the the area defined by CompositeHydrographArea.shp. For shapefiles, group names are provided in a `geographic_groups_col`. geographic_groups_col : str Field name in the `geographic_groups` shapefile(s) containing the observation group names associated with each polygon. max_obsname_len : int or None Maximum length for observation name prefix. Default of 13 allows for a PEST obsnme of 20 characters or less with <prefix>_yyyydd or <prefix>_<per>d<per> (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1) If None, observation names will not be truncated. PEST++ does not have a limit on observation name length. 
add_leading_zeros_to_sw_site_nos : bool Whether or not to pad site numbers using the :func:~`mapgwm.swflows.format_usgs_sw_site_id` function. By default, False. column_renames : dict, optional Option to rename columns in the data or metadata that are different than those listed above. For example, if the data file has a 'SITE_NO' column instead of 'SITE_BADGE':: column_renames={'SITE_NO': 'site_no'} by default None, in which case the renames listed above will be used. Note that the renames must be the same as those listed above for :func:`mapgwm.swflows.preprocess_flows` to work. outfile : str Where output file will be written. Metadata are written to a file with the same name, with an additional "_info" suffix prior to the file extension. Returns ------- data : DataFrame Preprocessed time series metadata : DataFrame Preprocessed metadata References ---------- `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>` Notes ----- """ # outputs if outfile is not None: outpath, filename = os.path.split(outfile) makedirs(outpath) outname, ext = os.path.splitext(outfile) out_info_csvfile = outname + '_info.csv' out_data_csvfile = outfile out_shapefile = outname + '_info.shp' # read the source data if not isinstance(data, pd.DataFrame): df = pd.read_csv(data, dtype={site_no_col: object}) else: df = data.copy() # check the columns for col in [datetime_col] + flow_data_columns: assert col in df.columns, "Column {} not found in {}".format(col, data) assert any({site_no_col, line_id_col}.intersection(df.columns)), \ "Neither {} or {} found in {}. Need to specify a site_no_col or line_id_col".format(site_no_col, line_id_col, data) # rename input columns to these names, # for consistent output dest_columns = { datetime_col: 'datetime', site_no_col: 'site_no', line_id_col: 'line_id', x_coord_col: 'x', y_coord_col: 'y', name_col: 'name', flow_qualifier_column: 'category' } # update the default column renames # with any supplied via column_renames parameter if isinstance(column_renames, collections.Mapping): dest_columns.update(column_renames) df.rename(columns=dest_columns, inplace=True) flow_data_columns = [ c if c not in dest_columns else dest_columns[c] for c in flow_data_columns ] # convert site numbers to strings; # add leading 0s to any USGS sites that should have them if 'site_no' in df.columns: df['site_no'] = format_site_ids(df['site_no'], add_leading_zeros_to_sw_site_nos) else: df['site_no'] = df[line_id_col] # read the source data if metadata is not None: if not isinstance(metadata, pd.DataFrame): md = pd.read_csv(metadata, dtype={site_no_col: object}) else: md = metadata.copy() if site_no_col not in md.columns or 'site_no' not in df.columns: raise IndexError( 'If metadata are supplied, both data and metadata must ' 'have a site_no column.') md.rename(columns=dest_columns, inplace=True) md['site_no'] = format_site_ids(md['site_no'], add_leading_zeros_to_sw_site_nos) md.index = md['site_no'] by_site = df.groupby('site_no') md['start_dt'] = pd.DataFrame(by_site['datetime'].first()) else: by_site = df.groupby('site_no') md = pd.DataFrame(by_site['datetime'].first()) md.columns = ['start_dt'] md['site_no'] = md.index md['end_dt'] = pd.DataFrame(by_site['datetime'].last()) md['n'] = pd.DataFrame(by_site['datetime'].count()) md.reset_index(inplace=True, drop=True) # assign metadata if supplied for col in 'x', 'y', 'line_id', 'name': if col in df.columns and col not in md.columns: by_site_no = dict(zip(df['site_no'], df[col])) md[col] = [by_site_no[sn] for sn in md['site_no']] 
if col != 'line_id': df.drop(col, axis=1, inplace=True) # index the dataframe to times; # truncate data before start date df.index = pd.to_datetime(df['datetime']) df.index.name = 'datetime' df = df.loc[start_date:].copy() # project x, y to model crs x_pr, y_pr = project((md.x.values, md.y.values), source_crs, dest_crs) md['x'], md['y'] = x_pr, y_pr md['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)] # cull data to that within the model area if active_area is not None: df, md = cull_data_to_active_area(df, active_area, active_area_id_column, active_area_feature_id, data_crs=dest_crs, metadata=md) # get the hydrography IDs corresponding to each site # using the included lookup table #if 'line_id' not in df.columns: # assert line_id_lookup is not None, \ # "need to include line_ids in a column, or line_id_lookup dictionary mapping line_ids to site numbers" # df = df.loc[df['site_no'].isin(line_id_lookup)].copy() # df['line_id'] = [line_id_lookup[sn] for sn in df['site_no']] if include_sites is not None: md = md.loc[md.site_no.isin(include_sites)] df = df.loc[df.site_no.isin(include_sites)] if include_line_ids is not None: md = md.loc[md.line_id.isin(include_line_ids)] df = df.loc[df.line_id.isin(include_line_ids)] # convert units # ensure that flow values are numeric (may be objects if taken directly from NWIS) unit_conversion = ( convert_volume_units(source_volume_units, dest_volume_units) / convert_time_units(source_time_units, dest_time_units)) for flow_col in flow_data_columns: df[flow_col] = pd.to_numeric(df[flow_col], errors='coerce') * unit_conversion df.dropna(subset=flow_data_columns, axis=0, inplace=True) # reformat qualifiers for consistent output # (lump to dest category columns of either estimated or measured) # with measured including values derived from baseflow separation or actual measurements) # output column name for flow qualifier column: dest_flow_qualifier_column = 'category' if flow_qualifier_column is not None: flow_qualifiers = { 'calculated': 'measured', # 'measured', 'base flow separated from measured values': 'measured', # 'measured', 'measured total flow': 'measured', 'estimated gaged': 'estimated', 'estimated ungaged': 'estimated' } df[dest_flow_qualifier_column] = df[flow_qualifier_column].replace( flow_qualifiers) else: df['category'] = default_qualifier # make unique n-character prefixes (site identifiers) for each observation location # 13 character length allows for prefix_yyyymmm in 20 character observation names # (BeoPEST limit) unique_obsnames = set() obsnames = [] for sn in md['site_no'].tolist(): if max_obsname_len is not None: name = make_obsname(sn, unique_names=unique_obsnames, maxlen=max_obsname_len) assert name not in unique_obsnames else: name = sn unique_obsnames.add(name) obsnames.append(name) md['obsprefix'] = obsnames # add area of interest information md['group'] = 'fluxes' md = assign_geographic_obsgroups(md, geographic_groups, geographic_groups_col, metadata_crs=dest_crs) # data columns data_cols = ['site_no', 'line_id', 'datetime' ] + flow_data_columns + ['category'] #if 'line_id' in md.columns and 'line_id' not in df.columns: # # only map line_ids to data if there are more site numbers # # implying that no site number maps to more than one line_id # if len(set(df.site_no)) >= len(set(df.line_id)): # ids = dict(zip(md['site_no'], md['line_id'])) # df['line_id'] = [ids[sn] for sn in df['site_no']] data_cols = [c for c in data_cols if c in df.columns] df = df[data_cols] md.index = md['site_no'] # save out the results if outfile is 
    # save out the results
    if outfile is not None:
        df2shp(md.drop(['x', 'y'], axis=1), out_shapefile, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        md.drop('geometry', axis=1).to_csv(out_info_csvfile, index=False, float_format='%g')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%g')
    return df, md
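
# Example call (illustrative sketch only; assumes this function is
# mapgwm.swflows.preprocess_flows and that 'flows.csv' uses the default
# column names, with a single 'flow' data column):
#
#     data, metadata = preprocess_flows(
#         'flows.csv',
#         flow_data_columns=['flow'],
#         source_crs=4269,   # EPSG code of the input coordinates
#         dest_crs=5070,     # EPSG code of the model/output CRS
#         outfile='output/preprocessed_flows.csv')
#
# Per the outfile documentation above, the site metadata would then also be
# written to 'output/preprocessed_flows_info.csv' and a companion shapefile.
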
def assign_layers_from_screen_top_botm(data, model,
                                       flux_col='q',
                                       screen_top_col='screen_top',
                                       screen_botm_col='screen_botm',
                                       label_col='site_no',
                                       across_layers=False,
                                       distribute_by='transmissivity',
                                       minimum_layer_thickness=2.):
    """Assign model layers to pumping flux data based on
    open interval. Fluxes are applied to each layer proportional
    to the fraction of open interval in that layer.

    Parameters
    ----------
    data : dataframe of well info
        Must have i, j or x, y locations
    model : mfsetup.MF6model or mfsetup.MFnwtModel instance
        Must have dis, and optionally, attached MFsetupGrid instance
    flux_col : column in data with well fluxes
    screen_top_col : column in data with screen top elevations
    screen_botm_col : column in data with screen bottom elevations
    label_col : column with well names (optional; default site_no)
    across_layers : bool
        True to distribute fluxes to multiple layers intersected
        by the open interval
    distribute_by : str ('thickness' or 'transmissivity')
        Distribute fluxes to layers based on thickness or
        transmissivity of intersected open intervals.

    Returns
    -------
    data : dataframe of well info, modified so that each row represents
        pumping in a single model layer (with fluxes modified proportional
        to the amount of open interval in that layer).
    """
    # inactive cells in either MODFLOW version
    if model.version == 'mf6':
        idomain = model.idomain
    else:
        idomain = model.bas6.ibound.array

    # 'boundname' column is used by wel setup for identifying wells
    if label_col in data.columns:
        data['boundname'] = data[label_col]
    if across_layers:
        raise NotImplementedError('Distributing fluxes to multiple layers')
    else:
        if distribute_by in {'thickness', 'transmissivity'}:
            i, j, x, y, screen_botm, screen_top = None, None, None, None, None, None
            if 'i' in data.columns and 'j' in data.columns:
                i, j = data['i'].values, data['j'].values
            elif 'x' in data.columns and 'y' in data.columns:
                raise NotImplementedError('Assigning well layers with just x, y')
                # unreachable until x, y support is implemented
                x, y = data['x'].values, data['y'].values
            if screen_top_col in data.columns:
                screen_top = data[screen_top_col].values
            if screen_botm_col in data.columns:
                screen_botm = data[screen_botm_col].values

            # get starting heads if available
            no_strt_msg = (f'Well setup: distribute_by: {distribute_by} selected '
                           'but model has no {} package for computing sat. '
                           'thickness.\nUsing full layer thickness.')
            strt3D = None
            if model.version == 'mf6':
                strt_package = 'IC'
            else:
                strt_package = 'BAS6'
            if strt_package not in model.get_package_list():
                warnings.warn(no_strt_msg.format(strt_package), UserWarning)
                strt2D = None
                strt3D = None
            else:
                strt = getattr(getattr(model, strt_package.lower()), 'strt')
                strt3D = strt.array
                strt2D = strt3D[:, i, j]

            thicknesses = get_open_interval_thickness(model,
                                                      heads=strt2D,
                                                      i=i, j=j, x=x, y=y,
                                                      screen_top=screen_top,
                                                      screen_botm=screen_botm)
            hk = np.ones_like(thicknesses)
            if distribute_by == 'transmissivity':
                no_k_msg = ('Well setup: distribute_by: transmissivity selected '
                            'but model has no {} package.\nFalling back to '
                            'distributing wells by layer thickness.')
                if model.version == 'mf6':
                    hk_package = 'NPF'
                    hk_var = 'k'
                elif model.version == 'mfnwt':
                    hk_package = 'UPW'
                    hk_var = 'hk'
                else:
                    hk_package = 'LPF'
                    hk_var = 'hk'
                if hk_package not in model.get_package_list():
                    warnings.warn(no_k_msg.format(hk_package), UserWarning)
                    hk = np.ones_like(thicknesses)
                else:
                    hk = getattr(getattr(model, hk_package.lower()), hk_var)
                    hk = hk.array[:, i, j]

            # for each i, j location with a well,
            # get the layer with the highest transmissivity in the open interval;
            # if distribute_by == 'thickness' or there is no hk array,
            # T == thicknesses.
            # round to avoid erratic floating point behavior
            # for (nearly) equal quantities
            T = np.round(thicknesses * hk, 2)
            # to get the deepest occurrence of a max value
            # (argmax alone would return the first, or shallowest),
            # take the argmax on a reversed view of the array
            # data['k'] = np.argmax(T, axis=0)
            T_r = T[::-1]
            data['k'] = len(T_r) - np.argmax(T_r, axis=0) - 1
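
            # e.g., for a 1-D array, the reversed-view argmax idiom
            # (used above and again below) works like this:
            #     a = np.array([5, 9, 9, 2])
            #     np.argmax(a)                     # 1 (first/shallowest max)
            #     len(a) - np.argmax(a[::-1]) - 1  # 2 (last/deepest max)
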
            # get thicknesses for all layers
            # (including portions of layers outside the open interval)
            all_layers = np.zeros((model.nlay + 1, model.nrow, model.ncol))
            all_layers[0] = model.dis.top.array
            all_layers[1:] = model.dis.botm.array
            all_layer_thicknesses = np.abs(np.diff(all_layers, axis=0))
            layer_thicknesses = -np.diff(all_layers[:, i, j], axis=0)

            # only include thicknesses for valid layers;
            # reset thicknesses to sat. thickness
            if strt3D is not None:
                sat_thickness = strt3D - model.dis.botm.array
                # cells where the head is above the layer top
                no_unsat = sat_thickness > all_layer_thicknesses
                sat_thickness[no_unsat] = all_layer_thicknesses[no_unsat]
                # cells where the head is below the cell bottom
                sat_thickness[sat_thickness < 0] = 0
                layer_thicknesses = sat_thickness[:, i, j]

            # set inactive cells to 0 thickness for the purpose of relocating wells
            layer_thicknesses[idomain[:, i, j] < 1] = 0
            data['idomain'] = idomain[data['k'], i, j]
            data['laythick'] = layer_thicknesses[data['k'].values,
                                                 list(range(layer_thicknesses.shape[1]))]

            # flag layers that are too thin or inactive
            inactive = idomain[data.k, data.i, data.j] < 1
            invalid_open_interval = (data['laythick'] < minimum_layer_thickness) | inactive

            if any(invalid_open_interval):
                outfile = model.cfg['wel']['output_files']['dropped_wells_file'].format(model.name)

                # move wells that are still in a thin layer to the thickest active layer
                data['orig_layer'] = data['k']
                # get T for all layers
                T_all_layers = np.round(layer_thicknesses * hk, 2)

                # to get the deepest occurrence of a max value
                # (argmax alone would return the first, or shallowest),
                # take the argmax on a reversed view of the array
                # Tmax_layer = np.argmax(T_all_layers, axis=0)
                T_all_layers_r = T_all_layers[::-1]
                Tmax_layer = len(T_all_layers_r) - np.argmax(T_all_layers_r, axis=0) - 1

                data.loc[invalid_open_interval, 'k'] = Tmax_layer[invalid_open_interval]
                data['laythick'] = layer_thicknesses[data['k'].values,
                                                     list(range(layer_thicknesses.shape[1]))]
                data['idomain'] = idomain[data['k'], i, j]

                # record which wells were moved or dropped, and why
                bad_wells = data.loc[invalid_open_interval].copy()
                bad_wells['category'] = 'moved'
                bad_wells['reason'] = 'longest open interval thickness < {} {} minimum'.format(
                    minimum_layer_thickness, model.length_units)
                bad_wells['routine'] = __name__ + '.assign_layers_from_screen_top_botm'
                msg = ('Warning: {} of {} wells in layers less than '
                       'specified minimum thickness of {} {}\n'
                       'were moved to the thickest layer at their i, j locations.\n'.format(
                           invalid_open_interval.sum(), len(data),
                           minimum_layer_thickness, model.length_units))
                still_below_minimum = bad_wells['laythick'] < minimum_layer_thickness
                bad_wells.loc[still_below_minimum, 'category'] = 'dropped'
                bad_wells.loc[still_below_minimum, 'reason'] = \
                    'no layer above minimum thickness of {} {}'.format(
                        minimum_layer_thickness, model.length_units)
                n_below = np.sum(still_below_minimum)
                if n_below > 0:
                    msg += ('Out of these, {} of {} total wells remaining in layers less than '
                            'specified minimum thickness of {} {}'.format(
                                n_below, len(data),
                                minimum_layer_thickness, model.length_units))
                    if flux_col in data.columns:
                        pct_flux_below = 100 * bad_wells.loc[still_below_minimum, flux_col].sum() / data[flux_col].sum()
                        msg += ', \nrepresenting {:.2f} % of total flux,'.format(pct_flux_below)
                    msg += '\nwere dropped. See {} for details.'.format(outfile)
                print(msg)

                # write shapefile and CSV output for wells that were dropped
                cols = ['k', 'i', 'j', 'boundname', 'category', 'laythick',
                        'idomain', 'reason', 'routine', 'x', 'y']
                if flux_col in data.columns:
                    cols.insert(3, flux_col)
                flux_below = bad_wells.groupby(['k', 'i', 'j']).first().reset_index()[cols]
                append_csv(outfile, flux_below, index=False, float_format='%g')
                if 'x' in flux_below.columns and 'y' in flux_below.columns:
                    flux_below['geometry'] = [Point(xi, yi)
                                              for xi, yi in zip(flux_below.x, flux_below.y)]
                    df2shp(flux_below, outfile[:-4] + '.shp', epsg=model.modelgrid.epsg)

                # cull the wells that are still below the min. layer thickness
                data = data.loc[data['laythick'] > minimum_layer_thickness].copy()
        else:
            raise ValueError('Unrecognized argument for distribute_by: {}'.format(distribute_by))
    return data
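
# Example call (illustrative sketch only; 'wells' is assumed to be a DataFrame
# with i, j, q, screen_top, screen_botm, and site_no columns, and 'm' an
# mfsetup model instance with dis and NPF/UPW/LPF packages attached):
#
#     wells = assign_layers_from_screen_top_botm(
#         wells, m,
#         flux_col='q',
#         screen_top_col='screen_top',
#         screen_botm_col='screen_botm',
#         distribute_by='transmissivity',
#         minimum_layer_thickness=2.)
#
# Each well is assigned to the layer with the greatest transmissivity (or
# thickness) within its open interval; wells landing in layers thinner than
# minimum_layer_thickness are moved to the thickest active layer at their
# i, j location, or dropped and logged as described above.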