Code Example #1
def rasters_to_grid(modelgrid, dem, rasters,
                    dem_elevation_units='meters', raster_elevation_units='meters',
                    dest_elevation_units='meters'):
    """Sample a sequence of rasters onto the i, j locations of a modelgrid,
    returning a 3D numpy array of the sampled elevations. Areas of nodata
    are filled with the next valid surface above.

    Parameters
    ----------
    modelgrid : Modflow-setup :class:`~mfsetup.grid.MFsetupGrid` instance
        Modflow-setup grid instance describing the model grid
    dem : str (filepath)
        Raster representing the land surface, at the highest resolution being contemplated for the model.
        Usually this is derived by sampling a higher resolution DEM using zonal statistics, taking
        the mean DEM value for each model cell.
    rasters : list of strings (filepaths)
        Raster surfaces describing hydrogeologic contacts surrounding the voxel data.
    dem_elevation_units : str, optional
        Elevation units of the dem raster, by default 'meters'
    raster_elevation_units : str, optional
        Elevation units of the rasters, by default 'meters'
    dest_elevation_units : str, optional
        Elevation units of the output surfaces, by default 'meters'

    References
    ----------
    See the documentation for the :func:`fill_cells_vertically <mfsetup.discretization.fill_cells_vertically>`
    function in Modflow-setup for an explanation of the filling process.

    """
    grid = modelgrid
    dem_elevations = get_values_at_points(dem, grid.xcellcenters, grid.ycellcenters,
                                          method='linear')
    # convert to model units
    dem_elevations *= convert_length_units(dem_elevation_units, dest_elevation_units)

    raster_elevations = []
    for raster in rasters:
        grid_cell_values = get_values_at_points(raster, grid.xcellcenters, grid.ycellcenters,
                                                method='linear')
        # convert to model units
        grid_cell_values *= convert_length_units(raster_elevation_units, dest_elevation_units)
        raster_elevations.append(grid_cell_values)
    raster_elevations = np.array(raster_elevations)
    
    # fill nans in the sampled original framework elevations
    # (nans are where a layer surface is absent)
    # fill the nans with the next surface above
    # see https://github.com/aleaf/modflow-setup/blob/develop/mfsetup/discretization.py
    model_top_filled, filled_raster_elevations = fill_cells_vertically(dem_elevations, raster_elevations)
    above_land_surface = filled_raster_elevations > dem_elevations
    # reset any values above land surface to land surface
    dem_means_3d = np.tile(dem_elevations, (filled_raster_elevations.shape[0], 1, 1))
    filled_raster_elevations[above_land_surface] = dem_means_3d[above_land_surface]
    del dem_means_3d
    filled_raster_elevations = np.vstack([np.reshape(dem_elevations, (1, *dem_elevations.shape)),
                                          filled_raster_elevations])
    return filled_raster_elevations
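
A minimal usage sketch for the function above, with hypothetical raster file names; it assumes an existing Modflow-setup grid instance (see mfsetup.grid.MFsetupGrid) and that the helpers used inside rasters_to_grid (get_values_at_points, convert_length_units, fill_cells_vertically, and numpy as np) have been imported from gis-utils and Modflow-setup as referenced in the docstring:

# modelgrid: an existing MFsetupGrid instance for a 100 row x 150 column model
surfaces = rasters_to_grid(modelgrid,
                           dem='dem_mean_elevs.tif',            # hypothetical file
                           rasters=['surf1.tif', 'surf2.tif'],  # hypothetical files
                           raster_elevation_units='feet',
                           dest_elevation_units='meters')
# the DEM is stacked on top of the filled raster surfaces,
# so the result has one more surface than the number of input rasters
assert surfaces.shape == (3, 100, 150)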
Code Example #2
    def assign_missing_elevs(self, top_raster, dem_units='meters'):
        """ Use the top of model raster, or land-surface raster,
        to assign the elevation for points where elevation is missing.
        
        Parameters
        ----------
        top_raster: str
            path to raster data set with land surface or model top elevation, used
            to assign missing values to water-use points
        dem_units: str, optional
            length units of the elevations in top_raster, by default 'meters'
        """

        no_elev = self.df['FROM_ALT_VA'].isnull()
        x_no_elev = self.df.loc[no_elev, 'x'].values
        y_no_elev = self.df.loc[no_elev, 'y'].values
        elevs = raster.get_values_at_points(top_raster,
                                            x=x_no_elev,
                                            y=y_no_elev,
                                            points_crs=self.dest_crs)
        elevs *= convert_length_units(dem_units, self.model_length_units)
        self.df.loc[no_elev, 'FROM_ALT_VA'] = elevs
        assert not self.df['FROM_ALT_VA'].isnull().any()
        self.well_elevations = dict(
            zip(self.df['SITE_NO'], self.df['FROM_ALT_VA']))
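
The core pattern above (sample a raster at point locations, convert to the target length units, and fill missing values) can be sketched outside the class; the DEM file name and point coordinates are hypothetical, and the import paths are assumptions based on the calls in the method:

import numpy as np
import pandas as pd
from gisutils import raster                      # assumed source of get_values_at_points
from mfsetup.units import convert_length_units   # assumed import path

df = pd.DataFrame({'x': [521000., 522500.], 'y': [1192000., 1193500.],
                   'elev_ft': [310., np.nan]})   # hypothetical points; elevations in feet
missing = df['elev_ft'].isnull()
sampled = raster.get_values_at_points('dem.tif',  # hypothetical DEM with elevations in meters
                                      x=df.loc[missing, 'x'].values,
                                      y=df.loc[missing, 'y'].values,
                                      points_crs=5070)
df.loc[missing, 'elev_ft'] = sampled * convert_length_units('meters', 'feet')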
Code Example #3
File: evaporation.py Project: ntdosch/modflow-setup
def hamon_evaporation(day_of_year, tmean_c, latitude_dd,
                      dest_length_units='inches'):
    """

    Parameters
    ----------
    day_of_year : int
        (Julian) day of the year
    tmean_c : float
        Average daily air temperature, in Celsius
    latitude_dd : float
        Latitude, decimal degrees
    dest_length_units : str
        Length units of output (e.g. ft., feet, meters, etc.)

    Returns
    -------
    E : float
        Open water evaporation, in inches per day
    """
    delta = solar_declination(day_of_year)
    omega = sunset_hour_angle(latitude_dd, delta)
    D = max_daylight_hours(omega)
    svp = saturation_vapor_pressure(tmean_c)
    svd = saturation_vapor_density(svp,
                                   tmean_c)
    E_inches = 0.55 * (D/12)**2 * (svd/100)
    mult = convert_length_units('inches', dest_length_units)
    return E_inches * mult
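
In terms of the quantities computed above, the function evaluates the Hamon relationship E = 0.55 * (D/12)**2 * (svd/100), in inches per day, where D is the maximum number of daylight hours and svd is the saturation vapor density, and then applies the requested unit conversion. A hypothetical call for mid-June conditions at 45 degrees north:

# day 170 (mid-June), 20 degrees C mean daily air temperature, latitude 45 N
e_ft_per_day = hamon_evaporation(170, 20., 45., dest_length_units='feet')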
Code Example #4
def test_rch_setup(pfl_nwt_with_dis, project_root_path, simulate_high_k_lakes):

    m = pfl_nwt_with_dis  #deepcopy(pfl_nwt_with_dis)
    m.cfg['high_k_lakes']['simulate_high_k_lakes'] = simulate_high_k_lakes
    # test intermediate array creation from rech specified as scalars
    m.cfg['rch']['rech'] = [0.001, 0.002]
    m.cfg['rch']['rech_length_units'] = 'meters'
    m.cfg['rch']['rech_time_units'] = 'days'
    rch = m.setup_rch()
    arrayfiles = m.cfg['intermediate_data']['rech']
    assert len(arrayfiles) == len(m.cfg['rch']['rech'])
    for f in arrayfiles:
        assert os.path.exists(f)

    # test intermediate array creation from source_data
    # (rasters of different shapes)
    inf_array = 'mfsetup/tests/data/plainfieldlakes/source_data/' \
                'net_infiltration__2012-01-01_to_2017-12-31__1066_by_1145__SUM__INCHES_PER_YEAR.tif'
    inf_array = os.path.join(project_root_path, inf_array)
    with rasterio.open(inf_array) as src:
        inf_values = src.read(1)

    m.cfg['rch']['source_data']['rech']['filename'] = inf_array
    m.cfg['rch']['rech'] = None
    m.cfg['rch']['source_data']['rech']['length_units'] = 'inches'
    m.cfg['rch']['source_data']['rech']['time_units'] = 'years'
    rch = m.setup_rch()

    # spatial mean recharge in model should approx. match the GeoTiff (which covers a larger area)
    avg_in_yr = rch.rech.array[0, 0, :, :].mean() * convert_length_units('meters', 'inches') * \
        convert_time_units('days', 'years')
    assert np.allclose(avg_in_yr, inf_values.mean() * m.cfg['rch']['source_data']['rech']['mult'], rtol=0.25)
    arrayfiles = m.cfg['intermediate_data']['rech']
    for f in arrayfiles:
        assert os.path.exists(f)

    # check that high-K lake recharge was assigned correctly
    if simulate_high_k_lakes:
        highklake_recharge = m.rch.rech.array[:, 0, m.isbc[0] == 2].mean(axis=1)
        print(highklake_recharge)
        print(m.high_k_lake_recharge)
        assert np.allclose(highklake_recharge, m.high_k_lake_recharge)
    else:
        assert not np.any(m._isbc2d == 2)

    # test writing of MODFLOW arrays
    rch.write_file()
    assert m.cfg['rch']['rech'] is not None
    for f in m.cfg['rch']['rech']:
        assert os.path.exists(f)
    assert os.path.exists(rch.fn_path)

    # test intermediate array creation from rech specified as arrays
    # (of same shape; use MODFLOW arrays written above)
    rch = m.setup_rch()
    arrayfiles = m.cfg['intermediate_data']['rech']
    for f in arrayfiles:
        assert os.path.exists(f)
Code Example #5
def test_rch_setup(shellmound_model_with_dis):
    m = shellmound_model_with_dis  # deepcopy(model)
    rch = m.setup_rch()
    rch.write()
    # check for irch file
    irchfile = os.path.join(m.model_ws, m.cfg['rch']['irch'][0]['filename'])
    assert os.path.exists(irchfile)
    irch = load_array(
        os.path.join(m.model_ws, m.cfg['rch']['irch'][0]['filename']))
    assert irch.shape[0] == m.nrow
    assert irch.shape[1] == m.ncol

    assert os.path.exists(os.path.join(m.model_ws, rch.filename))
    assert isinstance(rch, mf6.ModflowGwfrcha)
    assert rch.recharge is not None
    # get the same data from the source file
    ds = xr.open_dataset(m.cfg['rch']['source_data']['recharge']['filename'])
    x = xr.DataArray(m.modelgrid.xcellcenters.ravel(), dims='z')
    y = xr.DataArray(m.modelgrid.ycellcenters.ravel(), dims='z')

    unit_conversion = convert_length_units('inches', 'meters')

    def get_period_values(start, end):
        period_data = ds['net_infiltration'].loc[start:end].mean(axis=0)
        dsi = period_data.interp(x=x,
                                 y=y,
                                 method='linear',
                                 kwargs={
                                     'fill_value': np.nan,
                                     'bounds_error': True
                                 })
        data = dsi.values * unit_conversion
        return np.reshape(data, (m.nrow, m.ncol))

    # test steady-state avg. across all data
    values = get_period_values('2012-01-01', '2017-12-31')

    #assert np.allclose(values, m.rch.recharge.array[0, 0])
    # test period 1 avg. for those times
    values1 = get_period_values(m.perioddata['start_datetime'].values[1],
                                m.perioddata['end_datetime'].values[1])
    assert testing.rpd(values1.mean(), m.rch.recharge.array[1,
                                                            0].mean()) < 0.01

    # check that nodata are written as 0.
    tmp = rch.recharge.array[:2].copy()
    tmp[0, 0, 0, 0] = np.nan
    tmp = {i: arr[0] for i, arr in enumerate(tmp)}
    m._setup_array('rch',
                   'recharge',
                   datatype='transient2d',
                   data=tmp,
                   write_fmt='%.6e',
                   write_nodata=0.)
    rech0 = load_array(m.cfg['rch']['recharge'][0])
    assert rech0[0, 0] == 0.
    assert rech0.min() >= 0.
    assert np.allclose(m.rch.recharge.array[0, 0].ravel(), rech0.ravel())
Code Example #6
File: mfnwtmodel.py Project: surajitdb/modflow-setup
    def setup_dis(self):
        """"""
        package = 'dis'
        print('\nSetting up {} package...'.format(package.upper()))
        t0 = time.time()

        # resample the top from the DEM
        if self.cfg['dis']['remake_top']:
            self._setup_array(package,
                              'top',
                              datatype='array2d',
                              resample_method='linear',
                              write_fmt='%.2f')

        # make the botm array
        self._setup_array(package,
                          'botm',
                          datatype='array3d',
                          resample_method='linear',
                          write_fmt='%.2f')

        # put together keyword arguments for dis package
        kwargs = self.cfg['grid'].copy()  # nrow, ncol, delr, delc
        kwargs.update(self.cfg['dis'])  # nper, nlay, etc.
        kwargs = get_input_arguments(kwargs, fm.ModflowDis)
        # we need flopy to read the intermediate files
        # (it will write the files in cfg)
        lmult = convert_length_units('meters', self.length_units)
        kwargs.update({
            'top': self.cfg['intermediate_data']['top'][0],
            'botm': self.cfg['intermediate_data']['botm'],
            'nper': self.nper,
            'delc': self.modelgrid.delc * lmult,
            'delr': self.modelgrid.delr * lmult
        })
        for arg in ['perlen', 'nstp', 'tsmult', 'steady']:
            kwargs[arg] = self.perioddata[arg].values

        dis = fm.ModflowDis(model=self, **kwargs)
        self._perioddata = None  # reset perioddata
        #if not isinstance(self._modelgrid, MFsetupGrid):
        #    self._modelgrid = None  # override DIS package grid setup
        self.setup_grid()  # reset the model grid
        self._reset_bc_arrays()
        #self._isbc = None  # reset BC property arrays
        print("finished in {:.2f}s\n".format(time.time() - t0))
        return dis
Code Example #7
    def make_production_zones(self,
                              production_zones,
                              default_elevation_units='feet'):
        """ Make dictionary attributes for production zones.
        These are used to assign individual wells to production zones.
        The defaultdict is keyed by zone_name and then SITE_NO.

        Parameters
        ----------
        production_zones: dict
            Dictionary of production zone information, keyed by zone name.
            Each value is a list of [zone_top, zone_bot] or
            [zone_top, zone_bot, zone_units], where zone_top and zone_bot
            are paths to rasters of the zone top and bottom elevations, and
            zone_units are the elevation units of those rasters.
        default_elevation_units: str, optional
            Elevation units of the zone rasters, used for zones that do not
            include a units entry; by default 'feet'.

        # if only one list is passed, put it into a list.
        #if isinstance(zonelist[0], str):
        #    zonelist = [zonelist]
        key = self.site_no_col

        # get tops and bottoms of estimated production intervals at each well
        # make dictionaries to lookup by well
        for name, info in production_zones.items():
            top_raster, botm_raster, *units = info
            units = units[0] if units else default_elevation_units
            x = self.df['x'].values
            y = self.df['y'].values
            length_unit_conversion = convert_length_units(
                units, self.model_length_units)
            top_elevations = raster.get_values_at_points(
                top_raster, x=x, y=y) * length_unit_conversion
            self.prod_zone_top[name] = dict(zip(self.df[key], top_elevations))
            botm_elevations = raster.get_values_at_points(
                botm_raster, x=x, y=y) * length_unit_conversion
            self.prod_zone_bot[name] = dict(zip(self.df[key], botm_elevations))
            self.df['{}_top'.format(name)] = top_elevations
            self.df['{}_botm'.format(name)] = botm_elevations
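
A sketch of the input structure this method unpacks, with hypothetical raster file names; wu stands for an instance of the enclosing water-use class:

production_zones = {
    # zone name: [top raster, bottom raster, (optional) elevation units]
    'lower_claiborne': ['lc_top.tif', 'lc_botm.tif', 'feet'],
    'middle_claiborne': ['mc_top.tif', 'mc_botm.tif'],  # falls back to default_elevation_units
}
wu.make_production_zones(production_zones, default_elevation_units='feet')
# zone tops and bottoms are then available by site number, e.g.
# wu.prod_zone_top['lower_claiborne'][site_no]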
Code Example #8
def test_convert_length_units():
    assert np.allclose(convert_length_units(2, 1), 1 / .3048)
    assert np.allclose(convert_length_units(1, 2), .3048)
    assert np.allclose(convert_length_units('meters', 'feet'), 1 / .3048)
    assert np.allclose(convert_length_units('feet', 'meters'), .3048)
    assert np.allclose(convert_length_units('m', 'ft'), 1 / .3048)
    assert np.allclose(convert_length_units('ft', 'm'), .3048)
    assert np.allclose(convert_length_units(None, 'm'), 1.)
    assert np.allclose(convert_length_units('millimeters', 'meters'), 1 / 1000)
    assert np.allclose(convert_length_units('meters', 'millimeters'), 1000)
    assert np.allclose(convert_length_units('meters', 'km'), 0.001)
    assert np.allclose(convert_length_units('kilometers', 'meters'), 1000)
    assert np.allclose(convert_length_units('kilometers', 'cm'), 1000 * 100)
Code Example #9
File: bcs.py Project: surajitdb/modflow-setup
def setup_ghb_data(model):

    m = model
    source_data = model.cfg['ghb'].get('source_data').copy()
    # get the GHB cells
    # todo: generalize more of the GHB setup code and move it somewhere else
    if 'shapefile' in source_data:
        shapefile_data = source_data['shapefile']
        key = [k for k in shapefile_data.keys() if 'filename' in k.lower()][0]
        shapefile_name = shapefile_data.pop(key)
        ghbcells = rasterize(shapefile_name, m.modelgrid, **shapefile_data)
    else:
        raise NotImplementedError('Only shapefile input supported for GHBs')

    cond = model.cfg['ghb'].get('cond')
    if cond is None:
        raise KeyError("key 'cond' not found in GHB yaml input. "
                       "Must supply conductance via this key for GHB setup.")

    # sample DEM for minimum elevation in each cell with a GHB
    # todo: GHB: allow time-varying bheads via csv input
    vertices = np.array(m.modelgrid.vertices)[ghbcells.flat > 0, :, :]
    polygons = [Polygon(vrts) for vrts in vertices]
    if 'dem' in source_data:
        key = [
            k for k in source_data['dem'].keys() if 'filename' in k.lower()
        ][0]
        dem_filename = source_data['dem'].pop(key)
        with rasterio.open(dem_filename) as src:
            meta = src.meta

        # reproject the polygons to the dem crs if needed
        try:
            from gisutils import get_authority_crs
            dem_crs = get_authority_crs(src.crs)
        except:
            dem_crs = pyproj.crs.CRS.from_user_input(src.crs)
        if dem_crs != m.modelgrid.crs:
            polygons = project(polygons, m.modelgrid.crs, dem_crs)

        all_touched = False
        if meta['transform'][0] > m.modelgrid.delr[0]:
            all_touched = True
        results = zonal_stats(polygons,
                              dem_filename,
                              stats='min',
                              all_touched=all_touched)
        min_elevs = np.ones((m.nrow * m.ncol), dtype=float) * np.nan
        min_elevs[ghbcells.flat > 0] = np.array([r['min'] for r in results])
        units_key = [k for k in source_data['dem'] if 'units' in k]
        if len(units_key) > 0:
            min_elevs *= convert_length_units(source_data['dem'][units_key[0]],
                                              model.length_units)
        min_elevs = np.reshape(min_elevs, (m.nrow, m.ncol))
    else:
        raise NotImplementedError(
            'Must supply DEM to sample for GHB elevations\n'
            '(GHB: source_data: dem:)')

    # make a DataFrame with MODFLOW input
    i, j = np.indices((m.nrow, m.ncol))
    df = pd.DataFrame({
        'per': 0,
        'k': 0,
        'i': i.flat,
        'j': j.flat,
        'bhead': min_elevs.flat,
        'cond': cond
    })
    df.dropna(axis=0, inplace=True)

    # assign layers so that bhead is above botms
    df['k'] = get_layer(model.dis.botm.array, df.i, df.j, df.bhead)
    # remove GHB cells from places where the specified head is below the model
    below_bottom_of_model = df.bhead < model.dis.botm.array[-1, df.i,
                                                            df.j] + 0.01
    df = df.loc[~below_bottom_of_model].copy()

    # exclude inactive cells
    k, i, j = df.k, df.i, df.j
    if model.version == 'mf6':
        active_cells = model.idomain[k, i, j] >= 1
    else:
        active_cells = model.ibound[k, i, j] >= 1
    df = df.loc[active_cells]
    return df
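
A sketch of the 'ghb' configuration block this function appears to expect, written as a Python dict with hypothetical file names; note that the shapefile and DEM entries are located by searching for keys containing 'filename', and the DEM units by a key containing 'units':

ghb_cfg = {
    'cond': 100.,  # conductance assigned to every GHB cell (required)
    'source_data': {
        'shapefile': {'filename': 'ghb_cells.shp'},   # hypothetical shapefile of GHB locations
        'dem': {'filename': 'dem.tif',                # hypothetical DEM sampled for minimum elevations
                'elevation_units': 'feet'},
    },
}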
Code Example #10
    def __init__(self,
                 xlsx=None,
                 sheet=None,
                 csvfile=None,
                 site_no_col='SITE_NO',
                 x_coord_col='FROM_DEC_LONG_VA',
                 y_coord_col='FROM_DEC_LAT_VA',
                 start_date=None,
                 end_date=None,
                 source_crs=4269,
                 dest_crs=5070,
                 data_length_units='feet',
                 data_volume_units='mgal',
                 model_length_units='meters',
                 default_screen_len=20,
                 cols='default'):
        """ Constructor for the swuds class. Class methods will help pre-process
        water-use data for MAP models.  Dataframe produced by constructor
        has original SWUDS data, useful for debugging.

        Parameters
        ----------
        xlsx: str
            Path to xlsx data to be read.  If xlsx file is passed, then a
            selected worksheet from it will be converted to a csv file unless
            the csvfile parameter is specified as None.
        csvfile: str
            Path to csv file with data (if xlsx is None) or to a csvfile
            that is created from the selected worksheet.  If xlsx is None,
            then csvfile must be provided.
        sheet: str
            Name of worksheet in xlsx to be read, ignored if xlsx is None.
        cols: list of str
            List of columns to read from xlsx or csv, if None all columns are read.
            If 'default' (which is the default if nothing is specified) the default
            list of columns coded in the script is read.
        site_no_col : str, optional
            Column name in data with site identifiers,
            by default 'SITE_NO'
        x_coord_col : str, optional
            Column name in data with x-coordinates,
            by default 'FROM_DEC_LONG_VA'
        y_coord_col : str, optional
            Column name in data with y-coordinates,
            by default 'FROM_DEC_LAT_VA'
        start_date: str
            start time for simulation as string 'yyyy-mm-dd'
        end_date: str
            end time for simulation as string 'yyyy-mm-dd'
        source_crs : obj
            Coordinate reference system of the well locations in the data.
            A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
            passed to :meth:`pyproj.crs.CRS.from_user_input`

            Can be any of:
              - PROJ string
              - Dictionary of PROJ parameters
              - PROJ keyword arguments for parameters
              - JSON string with PROJ parameters
              - CRS WKT string
              - An authority string [i.e. 'epsg:4326']
              - An EPSG integer code [i.e. 4326]
              - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
              - An object with a `to_wkt` method.
              - A :class:`pyproj.crs.CRS` class

            By default, epsg:4269
        dest_crs : obj
            Coordinate reference system of the model. Same input types
            as ``source_crs``.
            By default, epsg:5070
        data_length_units : str; 'meters', 'feet', etc.
            Units of lengths in data (elevations, depths, etc.)
            by default, 'feet'
        data_volume_units : str; 'mgal', 'ft3', etc.
            Volumetric unit of pumping rates; time units are assumed to be days.
            By default, 'mgal' (million gallons)
        model_length_units : str; 'meters', 'feet', etc.
            Length units of model.
            by default, 'meters'

        Attributes
        ----------
        df: pandas dataframe
            pandas dataframe read from spreadsheet or csv.  Manipulated by other
            methods of the class
        aquifer_names: dict
            dictionary of aquifer names keyed by NWIS codes, read using import 
            statement from mapgwm.lookups
        regional_aquifers: dict
            dictionary of regional aquifers keyed by NWIS codes, read using import
            statement from mapgwm.lookups
        crs: int
            pyproj.crs.CRS instance describing the output coordinate reference system
        monthly_cols: list of str
            list of column names for monthly values
        start_date: str
            start time for simulation as string 'yyyy-mm-dd'
        end_date: str
            end time for simulation as string 'yyyy-mm-dd'
        default_screen_len: float
            default screen length in meters
        locations: dict
            dictionary of x,y locations keyed by SITE_NO, added in reproject method
        depths: dict
            dictionary of depth keyed by SITE_NO
        well_elevations: dict
            dictionary of well elevations keyed by SITE_NO
        prod_zone_top: defaultdict(dict)
            defaultdict with production zone top (in meters) for each well
            First key is the production zone name, and second is the SITE_NO
            For example  self.prod_zone_top['lower_claiborne']['WEL001'] = top_elev
        prod_zone_bot: defaultdict(dict)
            defaultdict with production zone bottom (in meters) for each well
            First key is the production zone name, and second is the SITE_NO
            For example  self.prod_zone_bot['lower_claiborne']['WEL001'] = bot_elev
        
        """

        # set some attributes
        self.monthly_cols = [
            'JAN_VAL', 'FEB_VAL', 'MAR_VAL', 'APR_VAL', 'MAY_VAL', 'JUN_VAL',
            'JUL_VAL', 'AUG_VAL', 'SEP_VAL', 'OCT_VAL', 'NOV_VAL', 'DEC_VAL'
        ]
        self.aquifer_names = aq_codes_dict['aquifer_code_names']
        self.regional_aquifers = aq_codes_dict['regional_aquifer']
        self.start_date = start_date
        self.end_date = end_date
        self.default_screen_len = default_screen_len
        self.source_crs = source_crs
        self.dest_crs = dest_crs
        self.data_length_units = data_length_units
        self.data_volume_units = data_volume_units
        self.model_length_units = model_length_units
        self.prod_zone_top = defaultdict(dict)
        self.prod_zone_bot = defaultdict(dict)
        self.locations = dict()
        self.site_no_col = site_no_col

        # now read in excel or csv file
        defaultcols = [
            "SITE_NO", "WATER_CD", "FROM_DEC_LAT_VA", "FROM_DEC_LONG_VA",
            "FROM_WELL_DEPTH_VA", "FROM_ALT_VA", "FROM_NAT_WATER_USE_CD",
            "FROM_NAT_AQFR_CD", "FROM_NAT_AQFR_NM", "FROM_AQFR_CD",
            "FROM_AQFR_NM", "YEAR", "SALINITY_CD", "JAN_VAL", "FEB_VAL",
            "MAR_VAL", "APR_VAL", "MAY_VAL", "JUN_VAL", "JUL_VAL", "AUG_VAL",
            "SEP_VAL", "OCT_VAL", "NOV_VAL", "DEC_VAL", "ANNUAL_VAL",
            "FROM_STATE_NM", "FROM_COUNTY_NM", "FROM_CONSTRUCTION_DT",
            "FROM_INVENTORY_DT"
        ]

        if isinstance(cols, list):
            usecols = cols
        elif cols is not None:
            usecols = defaultcols
        else:
            usecols = None

        if xlsx is None and csvfile is None:
            raise ValueError(
                'both xlsx and csvfile cannot be None for SWUDS object')
        elif xlsx is None:
            self.df = pd.read_csv(csvfile, usecols=usecols)
        else:
            self.df = pd.read_excel(xlsx, sheet_name=sheet, usecols=usecols)
            if csvfile is not None:
                self.df.to_csv(csvfile, index=False)

        self.df.columns = self.df.columns.str.upper()
        # remove trailing spaces from code names
        if 'FROM_AQFR_CD' in self.df.columns:
            self.df["FROM_AQFR_CD"] = self.df["FROM_AQFR_CD"].str.strip()

        # make depths floats
        if 'FROM_WELL_DEPTH_VA' in self.df.columns:
            self.df['SCREEN_BOT'] = pd.to_numeric(
                self.df['FROM_WELL_DEPTH_VA'], errors='coerce')
        else:
            self.df['SCREEN_BOT'] = np.nan
        if 'FROM_ALT_VA' in self.df.columns:
            self.df['FROM_ALT_VA'] = pd.to_numeric(self.df['FROM_ALT_VA'],
                                                   errors='coerce')

        # make dictionaries
        length_conversion = convert_length_units(data_length_units,
                                                 model_length_units)
        self.depths = dict(
            list(
                zip(self.df['SITE_NO'],
                    self.df['SCREEN_BOT'] * length_conversion)))
        self.well_elevations = dict(
            zip(self.df['SITE_NO'],
                self.df['FROM_ALT_VA'] * length_conversion))

        self.sort_sites(primarysort=site_no_col)
        # Best to reproject on init so that we know the points are in the dest_crs
        self.reproject(x_coord_col=x_coord_col,
                       y_coord_col=y_coord_col,
                       key=site_no_col)
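
A hypothetical instantiation of the constructor above; the class name Swuds is assumed from the docstring's reference to "the swuds class", and the file names are placeholders:

swuds = Swuds(xlsx='swuds_export.xlsx',    # hypothetical spreadsheet of SWUDS data
              sheet='water_use',           # hypothetical worksheet name
              csvfile='swuds_export.csv',  # cached copy written after reading the xlsx
              data_length_units='feet',
              model_length_units='meters',
              start_date='2008-01-01', end_date='2017-12-31')
# well elevations and depths are now in model units, keyed by site number
print(swuds.well_elevations)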
Code Example #11
    def get_data(self):

        # aggregate the data from multiple files
        dfs = []
        for id, f in self.filenames.items():
            meta = self.parse_header(f)
            df = pd.read_csv(f, skiprows=meta['skiprows'],
                             header=None, names=meta['column_names'])
            df.index = pd.to_datetime(df[self.datetime_column])
            df['start_datetime'] = df.index
            # check if data are monthly
            ndays0 = (df.index[1] - df.index[0]).days
            ismonthly = 28 <= ndays0 <= 31
            if ismonthly:
                ndays = df.index.days_in_month
                df['end_datetime'] = df['start_datetime'] + pd.to_timedelta(ndays, unit='D')
            elif ndays0 == 1:
                ndays = 1
                df['end_datetime'] = df['start_datetime']
            else:
                raise ValueError("Check {}; only monthly or daily values supported.".format(f))

            # convert precip to model units
            # assumes that precip is monthly values
            mult = convert_length_units(meta['length_units'],
                                        self.dest_model.length_units)
            df[meta['column_names'][1]] = df[meta['column_names'][1]] * mult/ndays

            # convert temperatures to C
            df[meta['column_names'][2]] = meta['temp_conversion'](df[meta['column_names'][2]])

            # record lake ID
            df[self.id_column] = id
            dfs.append(df)
        df = pd.concat(dfs)

        # sample values to model stress periods
        starttimes = self.dest_model.perioddata['start_datetime'].copy()
        endtimes = self.dest_model.perioddata['end_datetime'].copy()

        # if period ends are specified as the same as the next starttime
        # need to subtract a day, otherwise
        # pandas will include the first day of the next period in slices
        endtimes_equal_startimes = np.all(endtimes[:-1].values == starttimes[1:].values)
        #if endtimes_equal_startimes:
        #    endtimes -= pd.Timedelta(1, unit='d')

        period_data = []
        current_stat = None
        for kper, (start, end) in enumerate(zip(starttimes, endtimes)):
            # missing (period) keys default to 'mean';
            # 'none' to explicitly skip the stress period
            period_stat = self.period_stats.get(kper, current_stat)
            current_stat = period_stat
            aggregated = aggregate_dataframe_to_stress_period(df,
                                                              start_datetime=start,
                                                              end_datetime=end,
                                                              period_stat=period_stat,
                                                              id_column=self.id_column,
                                                              data_column=self.data_columns
                                                              )
            aggregated['per'] = kper
            period_data.append(aggregated)
        dfm = pd.concat(period_data)
        dfm.sort_values(by=['per', self.id_column], inplace=True)
        return dfm.reset_index(drop=True)
Code Example #12
def preprocess_te_wateruse(data,
                           start_date=None,
                           end_date=None,
                           active_area=None,
                           active_area_id_column=None,
                           active_area_feature_id=None,
                           estimated_production_zone_top=None,
                           estimated_production_zone_botm=None,
                           estimated_production_surface_units='feet',
                           source_crs=4269,
                           dest_crs=5070,
                           interp_method='linear',
                           data_volume_units='mgal',
                           model_length_units='meters',
                           outfile=None):
    """Preprocess water use data from thermoelectric power plants:

    * reproject data to a destination CRS (`dest_crs`)
    * cull data to an area of interest (`active_area`)
    * if input data do not have information on the well screen intervals,
      sample screen tops and bottoms from raster surfaces bounding
      an estimated production zone (e.g. `estimated_production_zone_top`)
    * reindex the data to continuous monthly values extending from `start_date`
      to `end_date`. Typically, these would bracket the time period for which
      the pumping should be simulated in a model. For example, the earliest data
      may be from 2010, but if the model starts in 2008, it may be appropriate to
      begin using the 2010 rates then (``start_date='2008'``). If no start or end
      date are given, the first and last years of pumping in `data` are used.
    * fill empty months by interpolation via a specified `interp_method`
    * backfill any remaining empty months going back to the `start_date`
    * write processed data to a CSV file and shapefile of the same name

    Parameters
    ----------
    data : DataFrame
        Thermoelectric water use data in the following format
        (similar to that output by :func:`mapgwm.te_wateruse.read_te_water_use_spreadsheet`):

        =============== =======================================================
        site_no         power plant identifier (plant code)
        start_datetime  pandas datetime representative of flux (e.g. '2010')
        x               x-coordinate of withdrawal, in `source_crs`
        y               y-coordinate of withdrawal, in `source_crs`
        q               withdrawal flux, in `data_volume_units` per day
        =============== =======================================================

    start_date : str
        Start date for pumping rates. If earlier than the dates in `data`,
        pumping rates will be backfilled to this date.
    end_date : str
        End date for pumping rates. If later than the dates in `data`,
        pumping rates will be forward filled to this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    source_crs : obj
        Coordinate reference system of the water-use data locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    interp_method : str
        Interpolation method to use for filling pumping rates to monthly values.
        By default, 'linear'
    data_volume_units : str; e.g. 'mgal', 'm3', 'cubic feet', etc.
        Volume units of pumping data. All time units are assumed to be in days.
    model_length_units : str; e.g. 'feet', 'm', 'meters', etc.
        Length units of model.
    outfile : str
        Path for output file. A shapefile of the same name is also written.
        If None, no output file is written. By default, None

    Returns
    -------
    df_monthly : DataFrame
        Preprocessed water use data, reindexed to continuous monthly values for each site.

    Notes
    -----
    * time units for TE data and model are assumed to be days

    """
    df = data.copy()

    # reproject to dest_crs
    x, y = project(zip(df['x'], df['y']), source_crs, dest_crs)
    df['x'], df['y'] = x, y
    df['geometry'] = [Point(x, y) for x, y in zip(x, y)]

    # drop wells with no location information (for now)
    df.dropna(subset=['x', 'y'], axis=0, inplace=True)

    # cull sites to those within the Delta footprint
    # cull data to that within the model area
    if active_area is not None:
        df = cull_data_to_active_area(df,
                                      active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs)

    # get top and bottom of estimated production interval at each well
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(
            estimated_production_surface_units, model_length_units)
        x, y = df.x.values, df.y.values
        est_screen_top = get_values_at_points(estimated_production_zone_top,
                                              x,
                                              y,
                                              points_crs=dest_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm,
                                               x,
                                               y,
                                               points_crs=dest_crs)
        est_screen_botm *= surf_unit_conversion
        df['screen_top'] = est_screen_top
        df['screen_botm'] = est_screen_botm

    # distribute fluxes to monthly values
    # set start and end dates if not already set
    if start_date is None:
        start_date = df.start_datetime.min()
    if end_date is None:
        end_date = df.start_datetime.max()
    groups = df.groupby('site_no')
    all_groups = []
    for site_no, group in groups:
        dfg = group.copy()

        # create a continuous monthly time index
        # labeled at the month start
        all_dates = pd.date_range(start_date, end_date, freq='MS')
        dfg.index = dfg['start_datetime']
        dfg = dfg.reindex(all_dates)

        # interpolate the discharge values;
        # back filling to the start date
        dfg['q'] = dfg.q.interpolate(method=interp_method).bfill()
        dfg['q'] *= convert_volume_units(data_volume_units, model_length_units)

        # fill remaining columns
        dfg['start_datetime'] = dfg.index
        fill_columns = set(dfg.columns).difference({'q', 'start_datetime'})
        fill_values = group.iloc[0].to_dict()
        for c in fill_columns:
            dfg[c] = fill_values[c]

        # add 'te' prefix to site number
        dfg['site_no'] = f'te_{site_no}'
        all_groups.append(dfg)
    df_monthly = pd.concat(all_groups)

    # assume most values represent abstraction
    # if sum is positive, invert so that output values are negative
    if df_monthly['q'].sum() > 0:
        df_monthly['q'] *= -1

    # clean up the columns
    cols = [
        'site_no', 'start_datetime', 'x', 'y', 'screen_top', 'screen_botm',
        'q', 'geometry'
    ]
    cols += list(set(df_monthly.columns).difference(cols))
    df_monthly = df_monthly[cols]

    # write the output
    if outfile is not None:
        outfile = Path(outfile)
        df_monthly.drop('geometry', axis=1).to_csv(outfile,
                                                   index=False,
                                                   float_format='%g')
        print('wrote {}'.format(outfile))

        # write only unique pumping values to shapefile
        to_shapefile = df_monthly.groupby(['site_no',
                                           'q']).first().reset_index()
        shapefile = outfile.with_suffix('.shp')
        df2shp(to_shapefile, shapefile, crs=dest_crs)
    return df_monthly
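
A minimal sketch of a call to the function above, using a two-record input table that follows the column layout in the docstring; the production-surface rasters are hypothetical:

import pandas as pd

data = pd.DataFrame({
    'site_no': ['plant1', 'plant1'],
    'start_datetime': pd.to_datetime(['2010', '2015']),
    'x': [-90.2, -90.2], 'y': [34.5, 34.5],   # longitude/latitude in source_crs (epsg:4269)
    'q': [2.5, 3.0],                          # withdrawals, in mgal per day
})
df_monthly = preprocess_te_wateruse(
    data,
    start_date='2008-01-01', end_date='2017-12-31',
    estimated_production_zone_top='mcaq_top.tif',     # hypothetical raster surfaces
    estimated_production_zone_botm='mcaq_botm.tif',
    estimated_production_surface_units='feet',
    data_volume_units='mgal', model_length_units='meters',
    outfile=None)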
Code Example #13
def test_dis_setup(pfl_nwt_with_grid):

    m = pfl_nwt_with_grid  #deepcopy(pfl_nwt_with_grid)

    # test intermediate array creation
    m.cfg['dis']['source_data']['top']['elevation_units'] = 'meters'
    m.cfg['dis']['lenuni'] = 2  # meters
    m.cfg['dis']['remake_top'] = True
    dis = m.setup_dis()
    assert 'DIS' in m.get_package_list()
    arrayfiles = m.cfg['intermediate_data']['top'] +\
                 m.cfg['intermediate_data']['botm']
    for f in arrayfiles:
        assert os.path.exists(f)

    # verify that modelgrid was reset after building DIS
    mg = m.modelgrid
    assert (mg.nlay, mg.nrow, mg.ncol) == m.dis.botm.array.shape
    assert np.array_equal(mg.top, m.dis.top.array)
    assert np.array_equal(mg.botm, m.dis.botm.array)

    # test using previously made external files as input
    if m.version == 'mf6':
        assert m.cfg['dis']['top'] == m.cfg['external_files']['top']
        assert m.cfg['dis']['botm'] == m.cfg['external_files']['botm']
    else:
        assert m.cfg['dis']['top'] == m.cfg['intermediate_data']['top']
        assert m.cfg['dis']['botm'] == m.cfg['intermediate_data']['botm']
    m.cfg['dis']['remake_top'] = False
    m.cfg['dis']['nper'] = 4
    m.cfg['dis']['perlen'] = [1, 1, 1, 1]
    m.cfg['dis']['nstp'] = [1, 1, 1, 1]
    m.cfg['dis']['tsmult'] = [1, 1, 1, 1]
    m.cfg['dis']['steady'] = [1, 0, 0, 1]
    dis = m.setup_dis()
    dis.write_file()
    arrayfiles = m.cfg['external_files']['top'] + \
                 m.cfg['external_files']['botm']
    for f in arrayfiles:
        assert os.path.exists(f)
    assert os.path.exists(dis.fn_path)

    # check settings
    assert m.cfg['dis']['steady'] == [True, False, False, True]
    assert dis.steady.array.tolist() == [True, False, False, True]

    # test unit conversion
    top_m = dis.top.array.copy()
    botm_m = dis.botm.array.copy()
    m.cfg['dis']['top'] = None  # arrays don't get remade if this has data
    m.cfg['dis']['botm'] = None
    del m.cfg['setup_grid']['top']
    del m.cfg['setup_grid']['botm']
    m.cfg['dis']['remake_top'] = True
    m.cfg['dis']['lenuni'] = 1 # feet
    m.cfg['setup_grid']['dxy'] = 20/.3048
    m.remove_package('DIS')
    m.setup_grid()
    #m._reset_bc_arrays()
    assert m.cfg['parent']['length_units'] == 'meters'
    assert m.cfg['parent']['time_units'] == 'days'
    assert m.length_units == 'feet'
    dis = m.setup_dis()
    assert np.allclose(dis.top.array.mean() * convert_length_units(1, 2), top_m.mean())
    assert np.allclose(dis.botm.array.mean() * convert_length_units(1, 2), botm_m.mean())
Code Example #14
def test_parse_source_data(source_data_cases,
                           source_data_from_model_cases,
                           pfl_nwt_with_grid, project_root_path):
    model = pfl_nwt_with_grid
    cases = source_data_cases + source_data_from_model_cases
    results = []

    sd = TabularSourceData.from_config(cases[0], type='tabular')
    assert isinstance(sd.filenames, dict)
    assert sd.length_unit_conversion == 1.
    assert sd.time_unit_conversion == 1.
    assert sd.unit_conversion == 1.

    sd = TabularSourceData.from_config(cases[1], type='tabular')
    assert isinstance(sd.filenames, dict)

    sd = TabularSourceData.from_config(cases[2], type='tabular')
    assert isinstance(sd.filenames, dict)

    sd = TabularSourceData.from_config(cases[3]['features_shapefile'])
    assert isinstance(sd.filenames, dict)

    var = 'rech'
    sd = ArraySourceData.from_config(cases[4]['infiltration_arrays'],
                                     variable=var,
                                     type='array')
    assert isinstance(sd.filenames, dict)
    assert sd.unit_conversion == 1. # no dest model

    sd = TabularSourceData.from_config(cases[9]['flowlines']['nhdplus_paths'])
    assert isinstance(sd.filenames, dict)

    # test conversion to model units
    for i, f in cases[4]['infiltration_arrays']['filenames'].items():
        cases[4]['infiltration_arrays']['filenames'][i] = os.path.join(project_root_path, f)
    sd = ArraySourceData.from_config(cases[4]['infiltration_arrays'],
                                     variable=var,
                                     dest_model=model)
    assert isinstance(sd.filenames, dict)
    assert sd.unit_conversion == convert_length_units('inches', 'meters') *\
        convert_time_units('years', 'days')
    data = sd.get_data()
    assert isinstance(data, dict)
    assert len(data) == len(cases[4]['infiltration_arrays']['filenames'])
    assert data[0].shape == model.modelgrid.shape[1:]
    assert sd.unit_conversion == 1/12 * .3048 * 1/365.25

    # test averaging of layer between two files
    sd = ArraySourceData.from_config(cases[6]['hk'],
                                     variable='hk',
                                     dest_model=model)
    data = sd.get_data()
    assert isinstance(sd.filenames, dict)
    assert np.allclose(data[1].mean(axis=(0, 1)), cases[6]['hk'][1])

    # test averaging of layers provided in source array
    sd = ArraySourceData.from_config(source_data_from_model_cases[0],
                                     variable='botm',
                                     dest_model=model)
    data = sd.get_data()
    mask = sd._source_grid_mask
    arr0 = sd.regrid_from_source_model(sd.source_array[0],
                                        mask=mask,
                                        method='linear')
    arr1 = sd.regrid_from_source_model(sd.source_array[1],
                                        mask=mask,
                                        method='linear')
    assert np.allclose(np.mean([arr0, arr1], axis=(0)), data[0])

    # TODO: write test for multiplier intermediate layers

    # test mapping of layers from binary file;
    # based on layer bottom mapping
    filename = source_data_from_model_cases[2]['from_parent']['binaryfile']
    source_model = pfl_nwt_with_grid.parent
    modelname = 'parent'
    pfl_nwt_with_grid._parent_layers = {0: -0.5, 1: 0, 2: 1, 3: 2, 4: 3}
    sd = MFBinaryArraySourceData(variable='strt', filename=filename,
                                 dest_model=model,
                                 source_modelgrid=source_model.modelgrid,
                                 from_source_model_layers={},
                                 length_units=model.cfg[modelname]['length_units'],
                                 time_units=model.cfg[modelname]['time_units'])
    data = sd.get_data()
    # first two layers in dest model should both be from parent layer 0
    mask = sd._source_grid_mask
    arr0 = sd.regrid_from_source_model(sd.source_array[0],
                                       mask=mask,
                                       method='linear')
    assert np.array_equal(data[0], data[1])
    assert np.array_equal(arr0, data[0])
    pfl_nwt_with_grid._parent_layers = None # reset
Code Example #15
File: tmr.py Project: wkitlasten/modflow-setup
    def __init__(
        self,
        parent_model,
        inset_model,
        parent_head_file=None,
        parent_cell_budget_file=None,
        parent_length_units=None,
        inset_length_units=None,
        inset_parent_layer_mapping=None,
        inset_parent_period_mapping=None,
    ):

        self.inset = inset_model
        self.parent = parent_model
        self.inset._set_parent_modelgrid()
        self.cbc = None
        self._inset_parent_layer_mapping = inset_parent_layer_mapping
        self._source_mask = None
        self._inset_parent_period_mapping = inset_parent_period_mapping
        self.hpth = None  # path to parent heads output file
        self.cpth = None  # path to parent cell budget output file

        self.pi0 = None
        self.pj0 = None
        self.pi1 = None
        self.pj1 = None
        self.pi_list = None
        self.pj_list = None

        if parent_length_units is None:
            parent_length_units = self.inset.cfg['parent']['length_units']
        if inset_length_units is None:
            inset_length_units = self.inset.length_units
        self.length_unit_conversion = convert_length_units(
            parent_length_units, inset_length_units)

        if parent_head_file is None:
            parent_head_file = os.path.join(self.parent.model_ws,
                                            '{}.hds'.format(self.parent.name))
            if os.path.exists(parent_head_file):
                self.hpth = parent_head_file
        else:
            self.hpth = parent_head_file
        if parent_cell_budget_file is None:
            for extension in 'cbc', 'cbb':
                parent_cell_budget_file = os.path.join(
                    self.parent.model_ws,
                    '{}.{}'.format(self.parent.name, extension))
                if os.path.exists(parent_cell_budget_file):
                    self.cpth = parent_cell_budget_file
                    break
        else:
            self.cpth = parent_cell_budget_file

        if self.hpth is None and self.cpth is None:
            raise ValueError(
                "No head or cell budget output files found for parent model {}"
                .format(self.parent.name))

        # get bounding cells in parent model for pfl_nwt model
        irregular_domain = False

        # see if irregular domain
        irregbound_cfg = self.inset.cfg['perimeter_boundary'].get(
            'source_data', {}).get('irregular_boundary')
        if irregbound_cfg is not None:
            irregular_domain = True
            irregbound_cfg['variable'] = 'perimeter_boundary'
            irregbound_cfg['dest_model'] = self.inset

            sd = ArraySourceData.from_config(irregbound_cfg)
            data = sd.get_data()
            idm_outline = data[0]
            connections = get_horizontal_connections(idm_outline,
                                                     connection_info=False,
                                                     layer_elevations=1,
                                                     delr=1,
                                                     delc=1,
                                                     inside=True)
            self.pi_list, self.pj_list = connections.i.to_list(
            ), connections.j.to_list()
        # otherwise just get the corners of the inset if rectangular domain
        else:
            self.pi0, self.pj0 = get_ij(
                self.parent.modelgrid, self.inset.modelgrid.xcellcenters[0, 0],
                self.inset.modelgrid.ycellcenters[0, 0])
            self.pi1, self.pj1 = get_ij(
                self.parent.modelgrid, self.inset.modelgrid.xcellcenters[-1,
                                                                         -1],
                self.inset.modelgrid.ycellcenters[-1, -1])
            self.parent_nrow_in_inset = self.pi1 - self.pi0 + 1
            self.parent_ncol_in_inset = self.pj1 - self.pj0 + 1

        # check for an even number of pfl_nwt cells per parent cell in x and y directions
        x_refinment = self.parent.modelgrid.delr[
            0] / self.inset.modelgrid.delr[0]
        y_refinment = self.parent.modelgrid.delc[
            0] / self.inset.modelgrid.delc[0]
        assert int(
            x_refinment
        ) == x_refinment, "pfl_nwt delr must be factor of parent delr"
        assert int(
            y_refinment
        ) == y_refinment, "pfl_nwt delc must be factor of parent delc"
        assert x_refinment == y_refinment, "grid must have same x and y discretization"
        self.refinement = int(x_refinment)
Code Example #16
def preprocess_headobs(
        data,
        metadata,
        head_data_columns=['head', 'last_head', 'head_std'],
        dem=None,
        dem_units='meters',
        start_date='1998-04-01',
        active_area=None,
        active_area_id_column=None,
        active_area_feature_id=None,
        source_crs=4269,
        dest_crs=5070,
        data_length_units='meters',
        model_length_units='meters',
        geographic_groups=None,
        geographic_groups_col=None,
        max_obsname_len=None,
        outfile='../source_data/observations/head_obs/preprocessed_head_obs.csv'
):
    """Preprocess head observation data, for example, groundwater level data output from the
    `visGWDB program <https://doi.org/10.5066/P9W004O6>`_.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates)
      to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area
    * length units are converted to those of the groundwater model. Open intervals for the wells are
      converted from depths to elevations
    * missing open intervals are filled based on well bottom depths (if available) and the median open
      interval length for the dataset.
    * Wells are categorized based on the quality of the open interval information (see the documentation
      for :func:`mapgwm.headobs.fill_well_open_intervals`).
    * Prefixes for observation names (with an optional length limit) that identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons
      (`geographic_groups` parameter)

    Parameters
    ----------
    data : DataFrame
        Head observation data, e.g. as output from :func:`mapgwm.headobs.get_data`.
        Columns:

        ========= ================================================================
        site_no   site identifier
        lat       latitude
        lon       longitude
        datetime  measurement dates in pandas datetime format
        head      average head for the period represented by the datetime
        last_head last head measurement for the period represented by the datetime
        head_std  standard deviation of measured heads within the datetime period
        ========= ================================================================

        Notes:

        * lat and lon columns can alternatively be in the metadata table
        * `last_head` and `head_std` only need to be included if they are in
          `head_data_columns`

    metadata : DataFrame
        Head observation data, e.g. as output from :func:`mapgwm.headobs.get_data`.

        Must have the following columns:

        ================= ==========================================================================
        site_no (index)   site identifier
        aqfr_cd           Local aquifer code
        screen_botm       Well screen bottom, as a depth below land surface, in feet
        screen_top        Well screen top, as a depth below land surface, in feet
        well_depth        Well depth, in feet
        well_el           Altitude of land surface, in feet
        ================= ==========================================================================

    head_data_columns : list of strings
        Columns in data with head values or their statistics.
        By default, 'head', 'last_head', 'head_std', which allows both
        the average and last head values for the stress period to be considered,
        as well as the variability of water levels contributing to an average value.
    dem : str, optional
        DEM raster of the land surface. Used for estimating missing wellhead elevations.
        Any reprojection to dest_crs is handled automatically, assuming
        the DEM raster has CRS information embedded (arc-ascii grids do not!)
        By default, None.
    dem_units : str, {'feet', 'meters', ..}
        Units of DEM elevations, by default, 'meters'
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269

    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    data_length_units : str; 'meters', 'feet', etc.
        Length units of head observations.
    model_length_units : str; 'meters', 'feet', etc.
        Length units of model.
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within
        the area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.

    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    df : DataFrame
        Preprocessed time series
    well_info : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_
    """

    df = data.copy()
    # multiplier to convert input length units to model units
    unit_conversion = convert_length_units(data_length_units,
                                           model_length_units)
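    # e.g. head data in feet with a model in meters gives a multiplier of 0.3048
    # (1 ft = 0.3048 m)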

    # outputs
    out_plot = None
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_plot = os.path.join(outpath, 'open_interval_lengths.pdf')
        out_shapefile = outname + '_info.shp'

    # set the starting and ending dates here
    stdate = pd.Timestamp(start_date)

    # convert to datetime; normalize to midnight (drop the time of day)
    df['datetime'] = pd.to_datetime(df.datetime).dt.normalize()

    # trim to the time range
    n_measurements = len(data)
    n_sites = len(set(data.site_no))
    print(
        f'starting with {n_measurements:,d} measurements at {n_sites:,d} unique wells'
    )
    no_data_in_period = df.datetime < stdate

    if np.any(no_data_in_period):
        in_period = df.datetime >= stdate
        n_sites_before = len(
            set(df.loc[no_data_in_period,
                       'site_no']).difference(set(df.loc[in_period,
                                                         'site_no'])))
        print((
            f'culling {no_data_in_period.sum():,d} measurements from {n_sites_before:,d} '
            f'sites prior to the start date of {start_date}'))
        df = df.loc[in_period]

    # collapse dataset to mean values at each site
    groups = df.groupby('site_no')
    well_info = groups.mean().copy()
    well_info = well_info.join(metadata, rsuffix='_meta')
    well_info['start_dt'] = groups.datetime.min()
    well_info['end_dt'] = groups.datetime.max()
    well_info.drop(labels=['year', 'month'], axis=1, inplace=True)
    well_info['site_no'] = well_info.index
    well_info['n'] = groups.datetime.count()

    # project x, y to model crs
    x_pr, y_pr = project((well_info.lon.values, well_info.lat.values),
                         source_crs, dest_crs)
    well_info.drop(['lon', 'lat'], axis=1, inplace=True)
    well_info['x'], well_info['y'] = x_pr, y_pr
    well_info['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, md = cull_data_to_active_area(df,
                                          active_area,
                                          active_area_id_column,
                                          active_area_feature_id,
                                          data_crs=dest_crs,
                                          metadata=well_info)

    # fill in missing wellhead elevations from the DEM, convert length units,
    # and convert screen top and bottom depths to elevations
    missing_elevations = well_info.well_el.isna()
    if dem is not None and np.any(missing_elevations):
        well_location_elevations = get_values_at_points(dem,
                                                        well_info['x'],
                                                        well_info['y'],
                                                        points_crs=dest_crs)
        well_location_elevations *= convert_length_units(
            dem_units, model_length_units)
        well_info.loc[missing_elevations,
                      'well_el'] = well_location_elevations[missing_elevations]

    length_columns = ['well_el'] + head_data_columns + ['screen_top', 'screen_botm']
    for col in length_columns:
        if col in well_info.columns:
            well_info[col] *= unit_conversion

    well_info['well_botm'] = well_info['well_el'] - well_info['well_depth']
    well_info['screen_top'] = well_info['well_el'] - well_info['screen_top']
    well_info['screen_botm'] = well_info['well_el'] - well_info['screen_botm']
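    # screen_top and screen_botm were depths below land surface; after the
    # subtractions above they are elevations in model length units,
    # e.g. land surface at 60 with a reported screen-top depth of 10
    # gives a screen-top elevation of 50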

    # just the data, site numbers, times and aquifer
    if 'head_std' not in head_data_columns:
        head_data_columns = head_data_columns + ['head_std']
    transient_cols = ['site_no', 'datetime'] + head_data_columns + ['n']
    transient_cols = [c for c in transient_cols if c in df.columns]
    df = df[transient_cols].copy()
    for c in head_data_columns:
        if c in df.columns:
            df[c] *= unit_conversion

    # #### trim down to only well_info with both estimated water levels and standard deviation
    # monthly measured levels may not have standard deviation
    # (as opposed to monthly statistical estimates)
    criteria = pd.notnull(well_info['head'])
    #if 'head_std' in df.columns:
    #    criteria = criteria & pd.notnull(well_info['head_std'])
    well_info = well_info[criteria]

    # verify that all well_info have a wellhead elevation
    assert not np.any(np.isnan(well_info.well_el))

    # categorize wells based on quality of open interval information
    # estimate missing open intervals where possible
    well_info = fill_well_open_intervals(well_info, out_plot=out_plot)

    # drop well_info with negative reported open interval
    #well_info = well_info.loc[open_interval_length > 0]

    # cull data to well_info in well info table
    has_metadata = df.site_no.isin(well_info.index)
    if np.any(~has_metadata):
        warnings.warn('culling {} wells not found in metadata table!'.format(
            np.sum(~has_metadata)))
        df = df.loc[has_metadata].copy()

    # make unique n-character prefixes (site identifiers) for each observation location
    # 13 character length allows for prefix_yyyymm in 20 character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in well_info.index.tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn,
                                unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        unique_obsnames.add(name)
        obsnames.append(name)
    well_info['obsprefix'] = obsnames
    obsprefix = dict(zip(well_info.index, well_info.obsprefix))
    df['obsprefix'] = [obsprefix[sn] for sn in df.site_no]

    # add area of interest information
    well_info['group'] = 'heads'
    well_info = assign_geographic_obsgroups(well_info,
                                            geographic_groups,
                                            geographic_groups_col,
                                            metadata_crs=dest_crs)

    # save out the results
    if outfile is not None:
        df2shp(well_info.drop(['x', 'y'], axis=1),
               out_shapefile,
               index=False,
               crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        well_info.drop('geometry', axis=1).to_csv(out_info_csvfile,
                                                  index=False,
                                                  float_format='%.2f')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%.2f')
    return df, well_info
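A minimal usage sketch for the head-observation preprocessing above (a sketch only, not part of the library): the function name `preprocess_headobs`, the input file names, and the 'obsgroup' field are assumptions for illustration; the keyword arguments follow the parameter names documented in the docstring.

import pandas as pd

# hypothetical inputs following the column layout described in the docstring
data = pd.read_csv('headobs_timeseries.csv')      # site_no, lat, lon, datetime, head, ...
metadata = pd.read_csv('headobs_metadata.csv', index_col='site_no')

df, well_info = preprocess_headobs(               # assumed name for the function above
    data=data, metadata=metadata,
    head_data_columns=['head', 'last_head', 'head_std'],
    dem='dem_mean_elevations.tif', dem_units='meters',
    start_date='1998-04-01',
    source_crs=4269, dest_crs=5070,
    data_length_units='feet', model_length_units='meters',
    geographic_groups='../source_data/extents/CompositeHydrographArea.shp',
    geographic_groups_col='obsgroup',             # assumed field name in the shapefile
    max_obsname_len=13,
    outfile='../preprocessed/head_obs.csv')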
Code example #17
0
def preprocess_iwum_pumping(ncfile,
                            start_date=None,
                            end_date=None,
                            active_area=None,
                            active_area_id_column=None,
                            active_area_feature_id=None,
                            estimated_production_zone_top=None,
                            estimated_production_zone_botm=None,
                            flux_variable='value',
                            nc_crs=5070,
                            dest_crs=5070,
                            nc_length_units='meters',
                            estimated_production_surface_units='meters',
                            model_length_units='meters',
                            outfile=None):
    """Get pumping from the Irrigation Water Use Model (IWUM; Wilson, 2020) output and
    assign open interval information, using raster surfaces of the
    top and bottom of an estimated production zone.

    Parameters
    ----------
    ncfile : file path
        NetCDF output from Irrigation Water Use Model
    start_date : str
        Cull data before this date.
    end_date : str
        Cull data after this date.
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    estimated_production_zone_top : file path
        Raster surface for assigning screen tops
    estimated_production_zone_botm : file path
        Raster surface for assigning screen bottoms
    flux_variable : str
        Variable in ncfile for pumping fluxes. Fluxes are assumed to
        represent total volumes for each time period.
    nc_crs : obj
        Coordinate Reference System (CRS) of ncfile.
        A Python int, dict, str, or pyproj.crs.CRS instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`.
        See http://pyproj4.github.io/pyproj/stable/api/crs/crs.html#pyproj.crs.CRS.from_user_input.
        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class
    dest_crs : obj
        Coordinate Reference System (CRS) of the output (model).
        Same input types as ``nc_crs``.
        By default, epsg:5070
    nc_length_units : str, {'meters', 'ft', etc.}
        Length units of pumped volumes in ncfile
    estimated_production_surface_units : str, {'meters', 'ft', etc.}
        Length units of elevations in estimated production surface rasters.
    model_length_units : str, {'meters', 'ft', etc.}
        Length units of model.
    outfile : csv file for output table

    Returns
    -------
    df : DataFrame
        Table of pumping rates (in model volume units per day), location,
        and open interval information.

        Columns:

        ============== ================================================
        site_no        index position of pumping rate in ncfile grid
        x              x-coordinate in `dest_crs`
        y              y-coordinate in `dest_crs`
        start_datetime start date of pumping period
        end_datetime   end date of pumping period
        screen_top     screen top elevation, in `model_length_units`
        screen_botm    screen bottom elevation, in `model_length_units`
        q              pumping rate, in model units
        geometry       shapely Point object representing location
        ============== ================================================

    Notes
    -----
    * Time units are assumed to be days.
    * Fluxes are assumed to represent total volumes for each time period
      indicated by the differences between successive values along the time axis of ncfile.
    """
    ds = xr.open_dataset(ncfile)
    time_variable = [k for k in ds.coords.keys() if k.lower() not in {'x', 'y'}][0]
    ds_x, ds_y = np.meshgrid(ds['x'], ds['y'])

    # original values are in m3, in each 1 mi2 cell
    # can leave in m3 if reassigning to 1km grid as point values
    length_conversion = convert_length_units(nc_length_units,
                                             model_length_units) ** 3
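    # the volume conversion factor is the cube of the length conversion factor,
    # e.g. meters to feet: 3.2808 ** 3 ≈ 35.31 (m3 to ft3)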
    unit_suffix = vol_suffix[model_length_units] + 'd'
    flux_col = 'q'  # 'flux_{}'.format(unit_suffix)  # output field name for fluxes

    # get top/botm elevations
    est_screen_top = None
    est_screen_botm = None
    if estimated_production_zone_top is not None and \
            estimated_production_zone_botm is not None:
        surf_unit_conversion = convert_length_units(estimated_production_surface_units,
                                                    model_length_units)
        est_screen_top = get_values_at_points(estimated_production_zone_top, ds_x, ds_y,
                                                points_crs=nc_crs)
        est_screen_top *= surf_unit_conversion
        est_screen_botm = get_values_at_points(estimated_production_zone_botm, ds_x, ds_y,
                                                 points_crs=nc_crs)
        est_screen_botm *= surf_unit_conversion

        # in any places where screen top is less than the screen botm,
        # set both at the mean
        loc = est_screen_top < est_screen_botm
        means = np.mean([est_screen_top, est_screen_botm], axis=0)
        est_screen_top[loc] = means[loc]
        est_screen_botm[loc] = means[loc]
        print(f'Reset screen top and bottom to mean elevation at {loc.ravel().sum()} '
              f'locations where screen top was < screen bottom')

    dfs = []
    times = pd.DatetimeIndex(ds[time_variable].loc[start_date:end_date].values)
    for n, period_start_date in enumerate(times):

        # for each time entry, get the data
        kwargs = {time_variable: period_start_date}
        arr = ds[flux_variable].sel(**kwargs).values

        # make sure pumping sign is negative
        # based on assumption that values are mostly abstraction
        if arr.sum() > 0:
            arr *= -1

        # set up a dataframe
        data = {'site_no': np.arange(ds_x.size),
                'x': ds_x.ravel(),
                'y': ds_y.ravel(),
                 }
        if est_screen_top is not None and est_screen_botm is not None:
            data.update({'screen_top': est_screen_top.ravel(),
                         'screen_botm': est_screen_botm.ravel()
                         }
                        )
        df = pd.DataFrame(data)
        df['start_datetime'] = period_start_date

        # get the end_date, handling last entry
        if n + 1 < len(times):
            period_end_date = times[n + 1]
        else:
            # set end date for the last period based on the previous period length
            last_start = dfs[-1]['start_datetime'].values[0]
            ndays = (pd.Timestamp(period_start_date) -
                     pd.Timestamp(last_start)).days
            period_end_date = period_start_date + pd.Timedelta(ndays, unit='d')

        # convert the time units
        ndays = (pd.Timestamp(period_end_date) -
                 pd.Timestamp(period_start_date)).days
        assert ndays > 0, "period_end_date {} is before period_start_date {}"\
            .format(period_end_date, period_start_date)
        time_conversion = 1 / ndays  # original quantities are volumes for the time period
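        # e.g. a 31-day period with a total pumped volume of -3.1e6 m3
        # becomes a rate of -3.1e6 / 31 = -1e5 m3/day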

        # time indexing in pandas is through last value
        period_end_date = pd.Timestamp(period_end_date) - pd.Timedelta(1, unit='d')
        df['end_datetime'] = period_end_date
        df[flux_col] = arr.ravel() * length_conversion * time_conversion

        # only include cells with withdrawals (fluxes < 0)
        df = df.loc[df[flux_col] < 0]

        dfs.append(df)
    df = pd.concat(dfs)

    # site number column (that would be unique from other integers from other data sources)
    df['site_no'] = [f'iwum_{node}' for node in df.site_no]

    # project the data to a destination crs, if provided
    # make a separate metadata dataframe with 1 row per location
    # to avoid redundant operations
    metadata = df.groupby('site_no').first().reset_index()[['site_no', 'x', 'y']]
    metadata.index = metadata['site_no']
    x_pr, y_pr = project((metadata.x.values, metadata.y.values), nc_crs, dest_crs)
    metadata['x'], metadata['y'] = x_pr, y_pr
    metadata['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull the data to the model area, if provided
    if active_area is not None:
        df, metadata = cull_data_to_active_area(df, active_area,
                                      active_area_id_column,
                                      active_area_feature_id,
                                      data_crs=dest_crs, metadata=metadata)

    # update data with x,y values projected in metadata
    x = dict(zip(metadata.site_no, metadata.x))
    y = dict(zip(metadata.site_no, metadata.y))
    df['x'] = [x[sn] for sn in df.site_no]
    df['y'] = [y[sn] for sn in df.site_no]
    if outfile is not None:
        outfile = Path(outfile)
        df.to_csv(outfile, index=False, float_format='%g')
        print('wrote {}'.format(outfile))

        # Make a plot of iwum output in mgal/day
        out_pdf_path = outfile.parent / 'plots'
        out_pdf_path.mkdir(exist_ok=True)
        plot_iwum_output(ncfile, flux_variable=flux_variable, outpath=out_pdf_path)

    return df
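A minimal usage sketch for `preprocess_iwum_pumping` (a sketch only; the NetCDF and raster file paths are hypothetical, and the keyword arguments follow the signature above):

df = preprocess_iwum_pumping(
    'iwum_output.nc',                                         # hypothetical IWUM output file
    start_date='2011-01-01', end_date='2018-12-31',
    estimated_production_zone_top='production_zone_top.tif',  # hypothetical raster surfaces
    estimated_production_zone_botm='production_zone_botm.tif',
    estimated_production_surface_units='feet',
    nc_crs=5070, dest_crs=5070,
    nc_length_units='meters', model_length_units='meters',
    outfile='../preprocessed/iwum_pumping.csv')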
Code example #18
0
File: obs.py Project: aleaf/usgs-map-gwmodels
def preprocess_obs(
    data,
    metadata=None,
    data_columns=['flow'],
    start_date=None,
    active_area=None,
    active_area_id_column=None,
    active_area_feature_id=None,
    source_crs=4269,
    dest_crs=5070,
    datetime_col='datetime',
    site_no_col='site_no',
    line_id_col='line_id',
    x_coord_col='x',
    y_coord_col='y',
    name_col='name',
    qualifier_column=None,
    default_qualifier='measured',
    obstype='flow',
    include_sites=None,
    include_line_ids=None,
    source_length_units='ft',
    source_time_units='s',
    dest_length_units='m',
    dest_time_units='d',
    geographic_groups=None,
    geographic_groups_col=None,
    max_obsname_len=None,
    add_leading_zeros_to_sw_site_nos=False,
    column_renames=None,
    outfile=None,
):
    """Preprocess observation data, for example, from NWIS or another data source that
    outputs time series in CSV format with site locations and identifiers.

    * Data are reprojected from a `source_crs` (Coordinate reference system; assumed to be in geographic coordinates)
      to the CRS of the model (`dest_crs`)
    * Data are culled to a `start_date` and optionally, a polygon or set of polygons defining the model area
    * length and time units are converted to those of the groundwater model.
    * Prefixes for observation names (with an optional length limit) that identify the location are generated
    * Preliminary observation groups can also be assigned, based on geographic areas defined by polygons
      (`geographic_groups` parameter)

    Parameters
    ----------
    data : csv file or DataFrame
        Time series of observations.
        Columns:

        ===================== ======================================
        site_no               site identifier
        datetime              measurement dates/times
        x                     x-coordinate of site
        y                     y-coordinate of site
        data_columns          Columns of observed values
        qualifier_column      Optional column with qualifiers for values
        ===================== ======================================

        Notes:

        * x and y columns can alternatively be in the metadata table
        * data_columns are denoted in `data_columns`; multiple
          columns can be included to process base flow and total flow, or
          other statistics in tandem
        * For example, `qualifier_column` may have "estimated" or "measured"
          flags denoting whether streamflows were derived from measured values
          or statistical estimates.

    metadata : csv file or DataFrame
        Observation site information.

        May include columns:

        ================= ================================================================================
        site_no           site identifier
        x                 x-coordinate of site
        y                 y-coordinate of site
        name              name of site
        line_id_col       Identifier for a line in a hydrography dataset that the site is associated with.
        ================= ================================================================================

        Notes:

        * other columns in metadata will be passed through to the metadata output

    data_columns : list of strings
        Columns in data with values or their statistics.
        By default, ['flow']
    start_date : str (YYYY-mm-dd)
        Simulation start date (cull observations before this date)
    active_area : str
        Shapefile with polygon to cull observations to. Automatically reprojected
        to dest_crs if the shapefile includes a .prj file.
        by default, None.
    active_area_id_column : str, optional
        Column in active_area with feature ids.
        By default, None, in which case all features are used.
    active_area_feature_id : str, optional
        ID of feature to use for active area
        By default, None, in which case all features are used.
    source_crs : obj
        Coordinate reference system of the head observation locations.
        A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
        passed to :meth:`pyproj.crs.CRS.from_user_input`

        Can be any of:
          - PROJ string
          - Dictionary of PROJ parameters
          - PROJ keyword arguments for parameters
          - JSON string with PROJ parameters
          - CRS WKT string
          - An authority string [i.e. 'epsg:4326']
          - An EPSG integer code [i.e. 4326]
          - A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
          - An object with a `to_wkt` method.
          - A :class:`pyproj.crs.CRS` class

        By default, epsg:4269
    dest_crs : obj
        Coordinate reference system of the model. Same input types
        as ``source_crs``.
        By default, epsg:5070
    datetime_col : str, optional
        Column name in data with observation date/times,
        by default 'datetime'
    site_no_col : str, optional
        Column name in data and metadata with site identifiers,
        by default 'site_no'
    line_id_col : str, optional
        Column name in data or metadata with identifiers for
        hydrography lines associated with observation sites.
        by default 'line_id'
    x_coord_col : str, optional
        Column name in data or metadata with x-coordinates,
        by default 'x'
    y_coord_col : str, optional
        Column name in data or metadata with y-coordinates,
        by default 'y'
    name_col : str, optional
        Column name in data or metadata with observation site names,
        by default 'name'
    qualifier_column : str, optional
        Column name in data with observation qualifiers, such
        as "measured" or "estimated".
        by default, None
    default_qualifier : str, optional
        Default qualifier to populate qualifier_column if it
        is None. By default, "measured"
    obstype : str, optional
        Modflow-6 observation type (e.g. 'downstream-flow' or 'stage'). 
        The last part of the name (after the last hyphen) is used as a suffix in the output 
        ``obsprefix`` column. E.g. 07275000-flow for downstream or upstream-flow at site 07275000.
        By default, 'flow'
    include_sites : list-like, optional
        Limit output to these sites.
        by default, None (include all sites)
    include_line_ids : list-like, optional
        Limit output to sites associated with these line identifiers.
        by default, None (include all sites)
    source_length_units : str, 'm3', 'm', 'cubic meters', 'ft3', etc.
        Length or volume units of the source data. By default, 'ft'
    source_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the source data. By default, 's'
    dest_length_units : str, 'm3', 'cubic meters', 'ft3', etc.
        Length or volume units of the output (model). By default, 'm'
    dest_time_units : str, 's', 'seconds', 'days', etc.
        Time units of the output (model). By default, 'd'
    geographic_groups : file, dict or list-like
        Option to group observations by area(s) of interest. Can
        be a shapefile, list of shapefiles, or dictionary of shapely polygons.
        A 'group' column will be created in the metadata, and observation
        sites within each polygon will be assigned the group name
        associated with that polygon.

        For example::

            geographic_groups='../source_data/extents/CompositeHydrographArea.shp'
            geographic_groups=['../source_data/extents/CompositeHydrographArea.shp']
            geographic_groups={'cha': <shapely Polygon>}

        Where 'cha' is an observation group name for observations located within
        the area defined by CompositeHydrographArea.shp. For shapefiles,
        group names are provided in a `geographic_groups_col`.

    geographic_groups_col : str
        Field name in the `geographic_groups` shapefile(s) containing the
        observation group names associated with each polygon.
    max_obsname_len : int or None
        Maximum length for observation name prefix. Default of 13
        allows for a PEST obsnme of 20 characters or less with
        <prefix>_yyyydd or <prefix>_<per>d<per>
        (e.g. <prefix>_2d1 for a difference between stress periods 2 and 1)
        If None, observation names will not be truncated. PEST++ does not have
        a limit on observation name length.
    add_leading_zeros_to_sw_site_nos : bool
        Whether or not to pad site numbers using the
        :func:~`mapgwm.swflows.format_usgs_sw_site_id` function.
        By default, False.
    column_renames : dict, optional
        Option to rename columns in the data or metadata that are different than those listed above.
        For example, if the data file has a 'SITE_NO' column instead of 'site_no'::

            column_renames={'SITE_NO': 'site_no'}

        by default None, in which case the default column names listed above are used.
        Note that any renames must map to the standard column names listed above for
        :func:`mapgwm.swflows.preprocess_obs` to work.
    outfile : str
        Where output file will be written. Metadata are written to a file
        with the same name, with an additional "_info" suffix prior to
        the file extension.

    Returns
    -------
    data : DataFrame
        Preprocessed time series
    metadata : DataFrame
        Preprocessed metadata

    References
    ----------
    `The PEST++ Manual <https://github.com/usgs/pestpp/tree/master/documentation>`_

    """
    # outputs
    if outfile is not None:
        outpath, filename = os.path.split(outfile)
        makedirs(outpath)
        outname, ext = os.path.splitext(outfile)
        out_info_csvfile = outname + '_info.csv'
        out_data_csvfile = outfile
        out_shapefile = outname + '_info.shp'

    # read the source data
    if not isinstance(data, pd.DataFrame):
        df = pd.read_csv(data, dtype={site_no_col: object})
    else:
        df = data.copy()
    # check the columns
    for col in [datetime_col] + data_columns:
        assert col in df.columns, "Column {} not found in {}".format(col, data)
    assert any({site_no_col, line_id_col}.intersection(df.columns)), \
        "Neither {} or {} found in {}. Need to specify a site_no_col or line_id_col".format(site_no_col,
                                                                                            line_id_col, data)
    # rename input columns to these names,
    # for consistent output
    dest_columns = {
        datetime_col: 'datetime',
        site_no_col: 'site_no',
        line_id_col: 'line_id',
        x_coord_col: 'x',
        y_coord_col: 'y',
        name_col: 'name',
        qualifier_column: 'category'
    }
    # update the default column renames
    # with any supplied via column_renames parameter
    if isinstance(column_renames, collections.abc.Mapping):
        dest_columns.update(column_renames)
    df.rename(columns=dest_columns, inplace=True)
    data_columns = [
        c if c not in dest_columns else dest_columns[c] for c in data_columns
    ]
    # convert site numbers to strings;
    # add leading 0s to any USGS sites that should have them
    if 'site_no' in df.columns:
        df['site_no'] = format_site_ids(df['site_no'],
                                        add_leading_zeros_to_sw_site_nos)
    else:
        df['site_no'] = df[line_id_col]

    # make obsprefix names with site and observation type
    df['obsprefix'] = [
        f"{site_no}-{obstype.split('-')[-1]}" for site_no in df['site_no']
    ]
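    # e.g. obstype='downstream-flow' and site '07275000' give obsprefix '07275000-flow'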

    # read the source data
    if metadata is not None:
        if not isinstance(metadata, pd.DataFrame):
            md = pd.read_csv(metadata, dtype={site_no_col: object})
        else:
            md = metadata.copy()
        if site_no_col not in md.columns or 'site_no' not in df.columns:
            raise IndexError(
                'If metadata are supplied, both data and metadata must '
                'have a site_no column.')
        md.rename(columns=dest_columns, inplace=True)
        md['site_no'] = format_site_ids(md['site_no'],
                                        add_leading_zeros_to_sw_site_nos)
        md.index = md['site_no']
        by_site = df.groupby('site_no')
        md['start_dt'] = pd.DataFrame(by_site['datetime'].first())
    else:
        by_site = df.groupby('site_no')
        md = pd.DataFrame(by_site['datetime'].first())
        md.columns = ['start_dt']
        md['site_no'] = md.index

    md['end_dt'] = pd.DataFrame(by_site['datetime'].last())
    md['n'] = pd.DataFrame(by_site['datetime'].count())
    md.reset_index(inplace=True, drop=True)

    # assign metadata if supplied
    for col in 'x', 'y', 'line_id', 'name':
        if col in df.columns and col not in md.columns:
            by_site_no = dict(zip(df['site_no'], df[col]))
            md[col] = [by_site_no[sn] for sn in md['site_no']]
            if col != 'line_id':
                df.drop(col, axis=1, inplace=True)

    # index the dataframe to times;
    # truncate data before start date
    df.index = pd.to_datetime(df['datetime'])
    df.index.name = 'datetime'
    df = df.loc[start_date:].copy()

    # project x, y to model crs
    x_pr, y_pr = project((md.x.values, md.y.values), source_crs, dest_crs)
    md['x'], md['y'] = x_pr, y_pr
    md['geometry'] = [Point(x, y) for x, y in zip(x_pr, y_pr)]

    # cull data to that within the model area
    if active_area is not None:
        df, md = cull_data_to_active_area(df,
                                          active_area,
                                          active_area_id_column,
                                          active_area_feature_id,
                                          data_crs=dest_crs,
                                          metadata=md)

    # get the hydrography IDs corresponding to each site
    # using the included lookup table
    #if 'line_id' not in df.columns:
    #    assert line_id_lookup is not None, \
    #    "need to include line_ids in a column, or line_id_lookup dictionary mapping line_ids to site numbers"
    #    df = df.loc[df['site_no'].isin(line_id_lookup)].copy()
    #    df['line_id'] = [line_id_lookup[sn] for sn in df['site_no']]

    if include_sites is not None:
        md = md.loc[md.site_no.isin(include_sites)]
        df = df.loc[df.site_no.isin(include_sites)]
    if include_line_ids is not None:
        md = md.loc[md.line_id.isin(include_line_ids)]
        df = df.loc[df.line_id.isin(include_line_ids)]

    # convert units
    # ensure that values are numeric (may be objects if taken directly from NWIS)
    if obstype == 'stage':
        unit_conversion = convert_length_units(source_length_units,
                                               dest_length_units)
    else:
        unit_conversion = (
            convert_volume_units(source_length_units, dest_length_units) /
            convert_time_units(source_time_units, dest_time_units))
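    # e.g. for flows in cfs ('ft', 's') and a model in m3/day ('m', 'd'),
    # the multiplier works out to about 2446.6 (1 cfs ≈ 2,446.6 m3/day)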
    for obs_col in data_columns:
        df[obs_col] = pd.to_numeric(df[obs_col],
                                    errors='coerce') * unit_conversion
    df.dropna(subset=data_columns, axis=0, inplace=True)

    # reformat qualifiers for consistent output
    # (lump to dest category columns of either estimated or measured)
    # with measured including values derived from baseflow separation or actual measurements)
    # output column name for qualifier column:
    dest_qualifier_column = 'category'
    if qualifier_column is not None:
        qualifiers = {
            'calculated': 'measured',
            'base flow separated from measured values': 'measured',
            'measured total flow': 'measured',
            'estimated gaged': 'estimated',
            'estimated ungaged': 'estimated'
        }
        df[dest_qualifier_column] = df[qualifier_column].replace(qualifiers)
    else:
        df['category'] = default_qualifier

    # make unique n-character prefixes (site identifiers) for each observation location
    # 13 character length allows for prefix_yyyymm in 20 character observation names
    # (BeoPEST limit)
    unique_obsnames = set()
    obsnames = []
    for sn in md['site_no'].tolist():
        if max_obsname_len is not None:
            name = make_obsname(sn,
                                unique_names=unique_obsnames,
                                maxlen=max_obsname_len)
            assert name not in unique_obsnames
        else:
            name = sn
        name = name + f"-{obstype.split('-')[-1]}"
        unique_obsnames.add(name)
        obsnames.append(name)
    md['obsprefix'] = obsnames

    # add area of interest information
    md['group'] = 'fluxes'
    md = assign_geographic_obsgroups(md,
                                     geographic_groups,
                                     geographic_groups_col,
                                     metadata_crs=dest_crs)

    # data columns
    data_cols = (['site_no', 'line_id', 'datetime', 'obsprefix']
                 + data_columns + ['category'])
    #if 'line_id' in md.columns and 'line_id' not in df.columns:
    #    # only map line_ids to data if there are more site numbers
    #    # implying that no site number maps to more than one line_id
    #    if len(set(df.site_no)) >= len(set(df.line_id)):
    #        ids = dict(zip(md['site_no'], md['line_id']))
    #    df['line_id'] = [ids[sn] for sn in df['site_no']]
    data_cols = [c for c in data_cols if c in df.columns]
    df = df[data_cols]

    md.index = md['site_no']
    # save out the results
    if outfile is not None:
        df2shp(md.drop(['x', 'y'], axis=1), out_shapefile, crs=dest_crs)
        print('writing {}'.format(out_info_csvfile))
        md.drop('geometry', axis=1).to_csv(out_info_csvfile,
                                           index=False,
                                           float_format='%g')
        print('writing {}'.format(out_data_csvfile))
        df.to_csv(out_data_csvfile, index=False, float_format='%g')
    return df, md
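A minimal usage sketch for `preprocess_obs` (a sketch only; the file paths and the 'obsgroup' field name are hypothetical, and the keyword arguments follow the signature above):

data, metadata = preprocess_obs(
    'streamflow_timeseries.csv',                           # hypothetical time series export
    metadata='streamflow_site_info.csv',
    data_columns=['flow'],
    start_date='1998-04-01',
    obstype='downstream-flow',
    source_length_units='ft', source_time_units='s',
    dest_length_units='m', dest_time_units='d',
    geographic_groups='../source_data/extents/CompositeHydrographArea.shp',
    geographic_groups_col='obsgroup',                      # hypothetical field name
    max_obsname_len=13,
    outfile='../preprocessed/flow_obs.csv')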