Example #1
def get_lake_stage_obs(lake_obs_files,
                       perioddata,
                       observed_values_file,
                       lake_site_numbers,
                       outfile=None,
                       variable_name='stage',
                       observed_values_site_id_col='obsprefix',
                       observed_values_obsval_col='measured',
                       write_ins=True):
    # read in the observed values and site locations
    if not isinstance(observed_values_file, pd.DataFrame):
        observed = pd.read_csv(observed_values_file)
    else:
        observed = observed_values_file
    observed.index = observed['obsnme']

    dfs = []
    for name, f in lake_obs_files.items():
        df = read_mf6_lake_obs(f, perioddata)
        df.reset_index(inplace=True)  # put index into datetime column

        # add obsnames
        steady = perioddata.steady.values
        prefix = '{}_lk'.format(lake_site_numbers[name])
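        # resulting obsnme examples (hypothetical site number 4348000):
        # '4348000_lk_ss' for a steady-state period, '4348000_lk_201501' for January 2015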
        obsnme = []
        group = []
        for per, dt in zip(df.kper, df.datetime):
            if steady[per]:
                obsnme.append(prefix + '_ss')
            else:
                obsnme.append('{}_{}'.format(prefix, dt.strftime('%Y%m')))
        df['obsnme'] = obsnme
        df.index = df.obsnme
        df['obsprefix'] = prefix
        df['name'] = name
        sim_values_column = 'sim_' + variable_name
        obs_values_column = 'obs_' + variable_name
        df[obs_values_column] = observed[observed_values_obsval_col]
        df['group'] = observed['group']
        # rename columns for consistency with other obs
        renames = {'stage': sim_values_column, 'kper': 'per'}
        df.rename(columns=renames, inplace=True)
        # drop values that don't have an observation
        df.dropna(subset=[obs_values_column], axis=0, inplace=True)
        dfs.append(df)
    df = pd.concat(dfs)

    # write output
    if outfile is not None:
        df.to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(df)} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(df,
                          outfile + '.ins',
                          obsnme_column='obsnme',
                          simulated_obsval_column=sim_values_column,
                          index=False)
    return df
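
A minimal usage sketch for the function above (not part of the original example). The file paths, lake name, and site number are hypothetical placeholders; perioddata is assumed to be a stress-period table with the 'steady' and date columns expected by read_mf6_lake_obs.

import pandas as pd

# hypothetical stress-period table with 'per'/'kper', 'steady', and datetime columns
perioddata = pd.read_csv('tables/stress_period_data.csv',
                         parse_dates=['start_datetime', 'end_datetime'])

# MODFLOW-6 lake observation output file(s), keyed by lake name
lake_obs_files = {'pleasant': 'pleasant.lake.obs.csv'}
# site number used to build the observation name prefix ('<site number>_lk')
lake_site_numbers = {'pleasant': '4348000'}

lake_stage = get_lake_stage_obs(lake_obs_files,
                                perioddata,
                                observed_values_file='tables/lake_stage_obs.csv',
                                lake_site_numbers=lake_site_numbers,
                                outfile='processed_lake_stage_obs.dat',
                                write_ins=True)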
Example #2
def get_modflow_mass_balance(modroot, outfile=None, write_ins=True):
    """
    read in the percent discrepancy for inset and parent models

    Parameters
    ----------
    modroot: root name of the model scenario
    outfile: filepath for output
    write_ins: bool. whether or not to write instruction file
    """
    print('reading in the mass balance files')
    # make a list with which to concatenate results
    dfs = []
    # read in both inset and parent list files
    for cmod in ['inset', 'parent']:
        # read in the list files
        mfl6 = fp.utils.Mf6ListBudget("{0}{1}_{2}.list".format(
            rundir, modroot, cmod))
        # get all the budget information
        df, _ = mfl6.get_dataframes(start_datetime="1-1-2012")
        # construct the obsname with the date etc.
        df['obsnme'] = [
            '{0}_discrep_{1:d}{2:02d}'.format(cmod, i.year, i.month)
            for i in df.index
        ]
        # append on the max absolute percent discrepancy
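        # note: DataFrame.append was removed in pandas 2.0;
        # with newer pandas this step would need pd.concat instead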
        df = df.append(
            {
                'obsnme': '{}_discrep_max'.format(cmod),
                'PERCENT_DISCREPANCY': df.PERCENT_DISCREPANCY.abs().max()
            },
            ignore_index=True)
        dfs.append(df[['obsnme', 'PERCENT_DISCREPANCY']])
    outdf = pd.concat(dfs)
    outdf['group'] = 'percent_discrep'
    outdf['obsval'] = 0
    outdf.to_csv(outfile, index=False, sep=' ')
    print(f'wrote {len(outdf):,} observations to {outfile}')
    if write_ins:
        write_insfile(outdf,
                      outfile + '.ins',
                      obsnme_column='obsnme',
                      simulated_obsval_column='PERCENT_DISCREPANCY',
                      index=False)
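
A hedged usage sketch (not from the original source). The function writes its output file rather than returning a value, and relies on module-level objects assumed here: flopy imported as fp, a rundir variable pointing to the directory with the list files, and write_insfile. The model root name and output path below are placeholders.

get_modflow_mass_balance('scenario1',
                         outfile='processed_percent_discrep_obs.dat',
                         write_ins=True)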
Example #3
def get_flux_obs(perioddata,
                 model_output_file='meras3_1L.sfr.obs.output.csv',
                 observed_values_file='../tables/flux_obs.csv',
                 observed_values_metadata_file=None,
                 variable_name='flux',
                 observed_values_site_id_col='obsprefix',
                 observed_values_datetime_col='datetime',
                 obsnme_date_suffix=True,
                 obsnme_suffix_format='%Y%m',
                 observed_values_obsval_col='obsval',
                 observed_values_group_column='obgnme',
                 observed_values_unc_column='uncertainty',
                 aggregrate_observed_values_by='mean',
                 drop_groups=None,
                 label_period_as_steady_state=None,
                 steady_state_period_start=None,
                 steady_state_period_end=None,
                 outfile=None,
                 write_ins=False):
    """[summary]

    Parameters
    ----------
    perioddata : DataFrame
        DataFrame with start/end dates for stress periods. Must have columns
        'time' (modflow time, in days), 'start_datetime' (start date for the stress period)
        and 'end_datetime' (end date for the stress period).
    model_output_file : str, optional
        Model observation CSV output file with simulated flux values,
        by default 'meras3_1L.sfr.obs.output.csv'
    observed_values_file : str, optional
        CSV file or DataFrame with observed flux values,
        by default '../tables/flux_obs.csv'
    observed_values_obsval_col : str, optional
        Column in observed_values_file with measured flux values,
        by default 'obsval'
    obsnme_date_suffix : bool
        If true, give observations a date-based suffix. Otherwise, assign a 
        stress period-based suffix. In either case, the format of the suffix
        is controlled by obsnme_suffix_format.
        by default True
    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes. Observation names are created following the format of
        <obsprefix>_<date or stress period suffix>. By default, ``'%Y%m'``,
        which would yield ``'202001'`` for a Jan, 2020 observation 
        (obsnme_date_suffix=True). If obsnme_date_suffix=False, obsnme_suffix_format
        should be a decimal format in the "new-style" string format
        (e.g. '{:03d}', which would yield ``'001'`` for stress period 1.)
    variable_name : str, optional
        Name used for the output columns of observed and simulated values
        ('obs_<variable_name>' and 'sim_<variable_name>'), by default 'flux'
    outfile : str, optional
        CSV file to write output to.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write a PEST instruction file, by default False

    Returns
    -------
    obsdata : DataFrame
        Table of observed flux values and their simulated equivalents, by stress period.
    """
    # validation checks
    check_obsnme_suffix(obsnme_date_suffix,
                        obsnme_suffix_format,
                        function_name='get_flux_obs')

    outpath = Path('.')
    if outfile is not None:
        outpath = Path(outfile).parent

    obs_values_column = 'obs_' + variable_name  # output column with observed values
    sim_values_column = 'sim_' + variable_name  # output column with simulated equivalents to observed values

    perioddata = perioddata.copy()
    set_period_start_end_dates(perioddata)
    perioddata.index = perioddata.per

    results = get_mf6_single_variable_obs(
        perioddata,
        model_output_file=model_output_file,
        variable_name=variable_name,
        obsnme_date_suffix=obsnme_date_suffix,
        obsnme_suffix_format=obsnme_suffix_format,
        label_period_as_steady_state=label_period_as_steady_state)

    # rename columns to their defaults
    renames = {  #observed_values_site_id_col: 'obsprefix',
        observed_values_datetime_col: 'datetime',
        observed_values_group_column: 'obgnme',
        observed_values_unc_column: 'uncertainty'
    }

    if not isinstance(observed_values_file, pd.DataFrame):
        observed = pd.read_csv(observed_values_file,
                               dtype={observed_values_site_id_col: object})
    else:
        observed = observed_values_file
    observed.rename(columns=renames, inplace=True)
    if 'obsprefix' not in observed.columns:
        observed['obsprefix'] = observed[observed_values_site_id_col]
    #observed.index = observed['obsnme']

    # read in the observed values metadata
    if observed_values_metadata_file is not None:
        if not isinstance(observed_values_metadata_file, pd.DataFrame):
            metadata = pd.read_csv(observed_values_metadata_file,
                                   dtype={observed_values_site_id_col: object})
        else:
            metadata = observed_values_metadata_file
        metadata.rename(columns=renames, inplace=True)
        if 'obsprefix' not in metadata.columns:
            metadata['obsprefix'] = metadata[observed_values_site_id_col]

        # join the metadata to the observed data
        metadata.index = metadata['obsprefix'].values
        observed.index = observed['obsprefix'].values
        join_cols = [
            c for c in ['screen_top', 'screen_botm', 'x', 'y', 'layer']
            if c in metadata.columns
        ]
        observed = observed.join(metadata[join_cols])

    # convert obs names and prefixes to lower case
    observed['obsprefix'] = observed['obsprefix'].str.lower()

    # cast datetimes to pandas datetimes
    observed['datetime'] = pd.to_datetime(observed['datetime'])
    observed['steady'] = False  # flag for steady-state observations

    # drop model results that aren't in the obs information file
    # these are probably observations that aren't in the model time period
    # (and therefore weren't included in the parent model calibration;
    # but modflow-setup would include them in the MODFLOW observation input)
    # also drop sites that are in the obs information file, but not in the model results
    # these include sites outside of the model (i.e. in the inset when looking at the parent)
    no_info_sites = set(results.obsprefix).symmetric_difference(
        observed.obsprefix)
    # dump these out to a csv
    print('Dropping {} sites with no information'.format(len(no_info_sites)))
    dropped_obs_outfile = outpath / 'dropped_head_observation_sites.csv'
    results.loc[results.obsprefix.isin(no_info_sites)].to_csv(
        dropped_obs_outfile, index=False)
    results = results.loc[~results.obsprefix.isin(no_info_sites)].copy()
    observed = observed.loc[~observed.obsprefix.isin(no_info_sites)].copy()

    # for each model stress period, get the simulated values
    # and the observed equivalents
    observed.index = pd.to_datetime(observed.datetime)
    periods = results.groupby('per')
    observed_simulated_combined = []
    for per, data in periods:

        # get the equivalent observed values
        start, end = perioddata.loc[per, ['start_datetime', 'end_datetime']]
        # date-based suffix
        if obsnme_date_suffix:
            suffix = pd.Timestamp(end).strftime(obsnme_suffix_format)
        # stress period-based suffix
        else:
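            # e.g. with obsnme_suffix_format='{:03d}', strip('{:}') leaves '03d',
            # so stress period 1 gets the suffix '001'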
            suffix = f"{per:{obsnme_suffix_format.strip('{:}')}}"

        # steady-state observations can represent a period
        # other than the "modflow time" in the perioddata table
        if per == label_period_as_steady_state:
            suffix = 'ss'
            if steady_state_period_start is not None:
                start = steady_state_period_start
            if steady_state_period_end is not None:
                end = steady_state_period_end
        observed_in_period = observed.sort_index().loc[start:end].reset_index(
            drop=True)
        if len(observed_in_period) == 0:
            warnings.warn(
                ('Stress period {}: No observations between start and '
                 'end dates of {} and {}!'.format(per, start, end)))
            continue
        observed_in_period.sort_values(by=['obsprefix', 'datetime'],
                                       inplace=True)
        if 'n' not in observed_in_period.columns:
            observed_in_period['n'] = 1
        by_site = observed_in_period.groupby('obsprefix')
        observed_in_period_rs = getattr(by_site,
                                        aggregrate_observed_values_by)()
        observed_in_period_rs['n'] = by_site.n.sum()
        observed_in_period_rs['datetime'] = pd.Timestamp(end)
        observed_in_period_rs.reset_index(inplace=True)  # put obsprefix back

        missing_cols = set(observed_in_period.columns).difference(
            observed_in_period_rs.columns)
        for col in missing_cols:
            observed_in_period_rs[col] = by_site[col].first().values
        observed_in_period_rs = observed_in_period_rs[
            observed_in_period.columns]
        obsnames = [
            '{}_{}'.format(prefix.lower(), suffix)
            for prefix in observed_in_period_rs.obsprefix
        ]
        observed_in_period_rs['obsnme'] = obsnames
        observed_in_period_rs.index = observed_in_period_rs['obsnme']

        # get the simulated equivalents
        any_simulated_obs = data.obsnme.isin(
            observed_in_period_rs.obsnme).any()
        if not any_simulated_obs:
            continue
        sim_values = []
        #for obsnme, layer in zip(observed_in_period_rs.obsnme, observed_in_period_rs.layer):
        #    obsnme_results = data.loc[obsnme]
        #    # if a DataFrame (with simulated values for multiple layers) is returned
        #    if len(obsnme_results.shape) == 2:
        #        layer = obsnme_results.iloc[np.argmin(obsnme_results.layer - layer)]['layer']
        #        sim_value = obsnme_results.iloc[layer][sim_values_column]
        #    # Series (row) in results DataFrame with single simulated value
        #    else:
        #        sim_value = obsnme_results[sim_values_column]
        #    sim_values.append(sim_value)
        observed_in_period_rs[sim_values_column] = data.reindex(
            observed_in_period_rs.index)[sim_values_column]

        # add stress period and observed values
        observed_in_period_rs['per'] = per
        observed_in_period_rs[obs_values_column] = observed_in_period_rs[
            observed_values_obsval_col]
        observed_simulated_combined.append(observed_in_period_rs)

    # Combined DataFrame of observed heads and simulated equivalents
    obsdata = pd.concat(observed_simulated_combined)

    # raise an error if there are duplicates- reindexing below will fail if this is the case
    if obsdata.index.duplicated().any():
        msg = ('The following observations have duplicate names. There should only be '
               'one observation per site, for each time period implied by the '
               'obsnme_suffix_format parameter.\n{}'.format(
                   obsdata.loc[obsdata.index.duplicated(keep=False)]))
        raise ValueError(msg)

    # drop any observations in specified groups
    # (e.g. lake stages that should be compared with lake package output)
    if drop_groups is not None and 'obgnme' in obsdata.columns:
        obsdata = obsdata.loc[~obsdata.obgnme.isin(drop_groups)].copy()

    # nans are where sites don't have observation values for that period
    # or sites that are in other model (inset or parent)
    obsdata.dropna(subset=[obs_values_column], axis=0, inplace=True)

    # add standard obsval and obgmne columns
    obsdata['obsval'] = obsdata[obs_values_column]
    if 'obgnme' not in obsdata.columns:
        obsdata['obgnme'] = variable_name

    # reorder the columns
    columns = [
        'datetime', 'per', 'obsprefix', 'obsnme', obs_values_column,
        sim_values_column, 'uncertainty', 'obsval', 'obgnme'
    ]
    columns = [c for c in columns if c in obsdata.columns]
    obsdata = obsdata[columns].copy()
    if 'layer' in columns:
        obsdata['layer'] = obsdata['layer'].astype(int)

    # fill NaT (not a time) datetimes
    fill_nats(obsdata, perioddata)

    obsdata.sort_values(by=['obsprefix', 'per'], inplace=True)
    if outfile is not None:
        obsdata.fillna(-9999).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(obsdata):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(obsdata,
                          str(outfile) + '.ins',
                          obsnme_column='obsnme',
                          simulated_obsval_column=sim_values_column,
                          index=False)
    return obsdata
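
A minimal usage sketch for get_flux_obs, assuming the stress-period table and observation files exist at the (hypothetical) paths shown; perioddata is passed as a DataFrame, matching how the function uses it.

import pandas as pd

perioddata = pd.read_csv('tables/stress_period_data.csv',
                         parse_dates=['start_datetime', 'end_datetime'])

flux_obs = get_flux_obs(perioddata,
                        model_output_file='meras3_1L.sfr.obs.output.csv',
                        observed_values_file='../tables/flux_obs.csv',
                        observed_values_obsval_col='obsval',
                        outfile='processed_flux_obs.dat',
                        write_ins=True)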
Example #4
def get_temporal_differences(base_data,
                             perioddata,
                             obs_values_col='obs_head',
                             sim_values_col='sim_head',
                             obstype='head',
                             get_displacements=False,
                             displacement_from=None,
                             obsnme_date_suffix=True,
                             obsnme_suffix_format='%Y%m',
                             exclude_suffix='ss',
                             exclude_obs=None,
                             outfile=None,
                             write_ins=False):
    """Takes the base_data dataframe output by :func:`mfobs.obs.get_obs`,
    creates temporal difference observations. Optionally writes an output csvfile
    and a PEST instruction file.

    Parameters
    ----------
    base_data : DataFrame
        Head observation data with same column structure as
        output from :func:`mfobs.obs.get_obs`
    perioddata : DataFrame
        DataFrame with start/end dates for stress periods. Must have columns
        'time' (modflow time, in days), 'start_datetime' (start date for the stress period)
        and 'end_datetime' (end date for the stress period).
    obs_values_col : str
        Column in ``base_data`` with observed values
    sim_values_col : str
        Column in ``base_data`` with simulated equivalent values
    obstype : str  {'head', 'flux', or other}
        Type of observation being processed. Simulated and observed values
        columns are named in the format 'sim_<obstype>' and 'obs_<obstype>',
        respectively. If there is no 'obgnme' column in ``base_data``,
        ``obstype`` is also used as a default base group name.
    get_displacements : bool
        If True, compute the displacement of each observation from 
        a datum (specified by ``displacement_from``). If False, difference
        each observation with the previous observation.
        by default, False
    displacement_from : str or date-like
        Datum for computing displacements. Must be in a format that can be
        used for time slicing in pandas (e.g. '2010-01-01', which would result
        in displacements from the first observation on or after '2010-01-01' at each site),
        or None, which would result in displacements from the first non-zero weighted
        observation at each site. By default, None
    obsnme_date_suffix : bool
        If true, give observations a date-based suffix. Otherwise, assign a 
        stress period-based suffix. In either case, the format of the suffix
        is controlled by obsnme_suffix_format.
        by default True
    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes. Observation names are created following the format of
        <obsprefix>_<date or stress period suffix>. By default, ``'%Y%m'``,
        which would yield ``'202001'`` for a Jan, 2020 observation 
        (obsnme_date_suffix=True). If obsnme_date_suffix=False, obsnme_suffix_format
        should be a decimal format in the "new-style" string format
        (e.g. '{:03d}', which would yield ``'001'`` for stress period 1.)
    exclude_suffix : str or list-like
        Option to exclude observations from differencing by suffix;
        e.g. 'ss' to exclude steady-state observations.
        By default, 'ss'
    exclude_obs : list-like
        Sequence of observation names to exclude from return/written dataset. For example,
        if sequential head differences are also being computed, the first displacement observation
        after the reference observation will be a duplicate of the first sequential head difference
        observation. By default, None (no observations excluded).
    outfile : str, optional
        CSV file to write output to.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write instruction file, by default False

    Returns
    -------
    period_diffs : DataFrame

    Notes
    -----
    Differences are computed by subtracting the previous time from the current,
    so a positive value indicates an increase.
    """
    # validation checks
    check_obsnme_suffix(obsnme_date_suffix,
                        obsnme_suffix_format,
                        function_name='get_temporal_differences',
                        obsdata=base_data)

    # only compute differences on transient obs
    if isinstance(exclude_suffix, str):
        exclude_suffix = [exclude_suffix]
    suffix = [obsnme.split('_')[1] for obsnme in base_data.obsnme]
    keep = ~np.in1d(suffix, exclude_suffix)
    base_data = base_data.loc[keep].copy()

    # group observations by site (prefix)
    sites = base_data.groupby('obsprefix')
    period_diffs = []
    for site_no, values in sites:
        values = values.sort_values(by=['per']).copy()
        values.index = values['datetime']

        # compute the differences
        if get_displacements:
            values = values.loc[displacement_from:]

            # some sites may not have any measurements
            # after displacement datum; skip these
            if len(values) <= 1:
                continue
            values['obsval'] = values[obs_values_col] - \
                values[obs_values_col].iloc[0]
            values['sim_obsval'] = values[sim_values_col] - \
                values[sim_values_col].iloc[0]
            # assign np.nan to starting displacements (of 0)
            # (so they get dropped later on,
            # consistent with workflow for sequential difference obs)
            values['obsval'].iloc[0] = np.nan
            values['sim_obsval'].iloc[0] = np.nan
        else:
            values['obsval'] = values[obs_values_col].diff()
            values['sim_obsval'] = values[sim_values_col].diff()

        # name the temporal difference obs as
        # <obsprefix>_<obsname1 suffix>d<obsname2 suffix>
        # where the obsval = obsname2 - obsname1
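        # e.g. 'site1_202102d202101' for the Feb. 2021 minus Jan. 2021 difference at a site 'site1'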
        obsnme = []
        for i, (idx, r) in enumerate(values.iterrows()):
            obsname2_suffix = ''
            if i > 0:
                if get_displacements:
                    obsname_2_loc = 0
                else:
                    obsname_2_loc = i - 1
                # date-based suffixes
                if obsnme_date_suffix:
                    obsname2_suffix = values.iloc[obsname_2_loc] \
                    ['datetime'].strftime(obsnme_suffix_format)
                # stress period-based suffixes
                else:
                    per = values.iloc[obsname_2_loc]['per']
                    obsname2_suffix = f"{per:{obsnme_suffix_format.strip('{:}')}}"
            obsnme.append('{}d{}'.format(r.obsnme, obsname2_suffix))
        values['obsnme'] = obsnme

        # todo: is there a general uncertainty approach for temporal differences that makes sense?

        period_diffs.append(values)
    period_diffs = pd.concat(period_diffs).reset_index(drop=True)
    period_diffs['datetime'] = pd.to_datetime(period_diffs['datetime'])

    # name the temporal difference obs as
    # <obsprefix>_<obsname1 suffix>d<obsname2 suffix>
    # where the obsval = obsname2 - obsname1
    #obsnme = []
    #for i, r in period_diffs.iterrows():
    #    obsname2_suffix = ''
    #    if i > 0:
    #        if get_displacements:
    #            obsname_2_loc = 0
    #        else:
    #            obsname_2_loc = i - 1
    #        # date-based suffixes
    #        if obsnme_date_suffix:
    #            obsname2_suffix = period_diffs.loc[obsname_2_loc,
    #                                               'datetime'].strftime(obsnme_suffix_format)
    #        # stress period-based suffixes
    #        else:
    #            per = period_diffs.loc[obsname_2_loc, 'per']
    #            obsname2_suffix = f"{per:{obsnme_suffix_format.strip('{:}')}}"
    #    obsnme.append('{}d{}'.format(r.obsnme, obsname2_suffix))
    #period_diffs['obsnme'] = obsnme
    if 'obgnme' not in period_diffs.columns:
        period_diffs['obgnme'] = obstype

    if get_displacements:
        period_diffs['type'] = f'{obstype} displacement'
        period_diffs['obgnme'] = [f'{g}_disp' for g in period_diffs['obgnme']]
    else:
        period_diffs['type'] = f'temporal {obstype} difference'
        period_diffs['obgnme'] = [f'{g}_tdiff' for g in period_diffs['obgnme']]

    # drop some columns that aren't really valid; if they exist
    period_diffs.drop(['n'], axis=1, inplace=True, errors='ignore')

    # clean up columns
    cols = [
        'datetime', 'per', 'obsprefix', 'obsnme', f'obs_{obstype}',
        f'sim_{obstype}', 'screen_top', 'screen_botm', 'layer', 'obsval',
        'sim_obsval', 'obgnme', 'type'
    ]
    cols = [c for c in cols if c in period_diffs.columns]
    period_diffs = period_diffs[cols]

    # drop observations with no difference (first observations at each site)
    period_diffs.dropna(axis=0, subset=['obsval', 'sim_obsval'], inplace=True)

    # drop any excluded obs
    if exclude_obs is not None:
        exclude_obs = set(exclude_obs)
        print(
            f"dropping {len(exclude_obs)} observations specified with exclude_obs"
        )
        period_diffs = period_diffs.loc[~period_diffs['obsnme'].
                                        isin(exclude_obs)]

    # fill NaT (not a time) datetimes
    fill_nats(period_diffs, perioddata)

    if outfile is not None:
        period_diffs.fillna(-9999).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(period_diffs):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(period_diffs,
                          str(outfile) + '.ins',
                          obsnme_column='obsnme',
                          simulated_obsval_column='sim_obsval',
                          index=False)
    return period_diffs
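
A hedged usage sketch: base_data is assumed to be a table like the one returned by get_head_obs (Example #6) or mfobs.obs.get_obs, with 'obsnme', 'obsprefix', 'per', 'datetime', 'obs_head', and 'sim_head' columns; the output file name is a placeholder.

# base_data and perioddata as produced in the other examples
head_tdiffs = get_temporal_differences(base_data,
                                       perioddata,
                                       obs_values_col='obs_head',
                                       sim_values_col='sim_head',
                                       obstype='head',
                                       outfile='processed_head_tdiff_obs.dat',
                                       write_ins=True)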
Example #5
def get_spatial_differences(base_data,
                            perioddata,
                            difference_sites,
                            obs_values_col='obs_head',
                            sim_values_col='sim_head',
                            obstype='head',
                            use_gradients=False,
                            sep='-d-',
                            write_ins=False,
                            outfile=None):
    """Takes the base_data dataframe output by :func:`mfobs.obs.get_obs` and creates
    spatial difference observations. Optionally writes an output csvfile
    and a PEST instruction file.

    Parameters
    ----------
    base_data : DataFrame
        Table of preprocessed observations, such as that produced by
        :func:`mfobs.obs.get_obs`
    perioddata : DataFrame
        DataFrame with start/end dates for stress periods. Must have columns
        'time' (modflow time, in days), 'start_datetime' (start date for the stress period)
        and 'end_datetime' (end date for the stress period).
    difference_sites : dict
        Dictionary of site numbers (keys) and other site numbers to compare to (values).
        Values can be a string for a single site, a list of strings for multiple sites,
        or a string pattern contained in multiple site numbers;
        observations at the sites represented in the values will be compared to the observation
        at the site represented by the key, at times of coincident measurements. Differences
        are computed by subtracting the values site(s) from the key site, so for example,
        to represent a gain in streamflow as positive, the downstream site should be the key site.
    obs_values_col : str
        Column in ``base_data`` with observed values
    sim_values_col : str
        Column in ``base_data`` with simulated equivalent values
    obstype : str  {'head', 'flux', or other}
        Type of observation being processed. Simulated and observed values
        columns are named in the format 'sim_<obstype>' and 'obs_<obstype>',
        respectively. If there is no 'obgnme' column in ``base_data``,
        ``obstype`` is also used as a default base group name. Finally,
        a 'type' column is included in the output with the label
        'vertical <obstype> gradient' or '<obstype> difference', depending on whether
        ``use_gradients=True``.
    use_gradients : bool
        If True, compute vertical hydraulic gradients and use those for the
        observation values, if False, use differences. For this option,
        'screen_top' and 'screen_botm' columns are needed in ``base_data``.
        By default False.
    sep : str
        Separator in spatial difference obsnames. For example, with
        sites "site1" and "site2" at time "202001", and sep='-d-', the obsnme
        would be "site1-d-site2_202001".
        by default, '-d-'
    outfile : str, optional
        CSV file to write output to. Nan values are filled with -9999.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write instruction file, by default False

    Returns
    -------
    spatial_differences : DataFrame
        Spatial difference observations. Columns:

        ================= ===================================================================================
        datetime          observation date-time labels
        per               model stress period
        obsprefix         observation name prefix (site identifier)
        obsnme1           name of observation from keys of ``difference_sites``
        <obs_values_col>1 observed value associated with obsnme1
        <sim_values_col>1 simulated equivalent associated with obsnme1
        screen_top1       well screen top (elevation) associated with obsnme1*
        screen_botm1      well screen botm (elevation) associated with obsnme1*
        layer1            model layer associated with obsnme1*
        obsnme2           name of observation from value(s) in ``difference_sites`` (associated with obsnme1)
        <obs_values_col>2 observed value associated with obsnme2
        <sim_values_col>2 simulated equivalent associated with obsnme2
        screen_top2       well screen top (elevation) associated with obsnme2*
        screen_botm2      well screen botm (elevation) associated with obsnme2*
        layer2            model layer associated with obsnme2*
        obs_diff          observed difference between obsnme1 and obsnme2
        sim_diff          simulated equivalent difference between obsnme1 and obsnme2
        dz                distance between well screen midpoints for obsnme1 and obsnme2*
        obs_grad          observed vertical hydraulic gradient between obsnme1 and obsnme2*
        sim_grad          simulated equivalent vertical hydraulic gradient between obsnme1 and obsnme2*
        obgnme            observation group
        obsnme            spatial difference observation name
        obsval            observation value (i.e. for PEST control file)
        sim_obsval        simulated equivalent (i.e. for PEST instruction file)
        type              description of spatial difference observations
        uncertainty       (loosely) error-based uncertainty, assumed to be 2x that of obsnme2
        ================= ===================================================================================

        Notes:

        * * denotes optional columns that may not be present.
        * Columns relating to well open interval are only created if ``obstype='head'``
          and ``base_data`` has 'screen_top' and 'screen_botm' columns.
        * Negative difference or gradient values indicate a gradient towards the key site.

    """

    # model stress period data:
    perioddata = perioddata.copy()
    # make sure start and end dates don't overlap
    set_period_start_end_dates(perioddata)
    perioddata.index = perioddata.per

    # get subset of base_data sites to compare to each key site in difference_sites
    base_data_sites = set(base_data.obsprefix)
    groups = base_data.groupby('obsprefix')
    spatial_differences = []
    for key_site_no, patterns in difference_sites.items():

        if key_site_no not in base_data_sites:
            print((f'warning: site {key_site_no} not in base_data. '
                   'Skipping spatial differencing.'))
            continue

        compare = []
        if isinstance(patterns, str):
            patterns = [patterns]
        for pattern in patterns:
            matches = [
                True if pattern in site_name else False
                for site_name in base_data.obsprefix
            ]
            compare.append(matches)
        compare = np.any(compare, axis=0)
        sites = set(base_data.loc[compare, 'obsprefix'])

        # for each site in the subset, compare the values to the keys site
        # index by stress period
        key_values = groups.get_group(key_site_no).copy()
        key_values.index = key_values.per

        for obsprefix, site_observations in groups:
            if obsprefix in sites:
                site_obs = site_observations.copy()
                site_obs.rename(
                    columns={
                        obs_values_col: f"{obs_values_col}2",  # 'obs_head2',
                        sim_values_col: f"{sim_values_col}2",  # 'sim_head2',
                        'obsnme': 'obsnme2',
                        'screen_top': 'screen_top2',
                        'screen_botm': 'screen_botm2',
                        'layer': 'layer2'
                    },
                    inplace=True)
                site_obs.index = site_obs.per
                site_obs['obsnme1'] = key_values['obsnme']
                site_obs[f"{obs_values_col}1"] = key_values[obs_values_col]
                site_obs[f"{sim_values_col}1"] = key_values[sim_values_col]
                if 'screen_top' in key_values.columns:
                    site_obs['screen_top1'] = key_values['screen_top']
                if 'screen_botm' in key_values.columns:
                    site_obs['screen_botm1'] = key_values['screen_botm']
                if 'layer2' in site_obs.columns:
                    site_obs['layer1'] = key_values['layer']
                # negative values indicate gradient towards key site
                # (key site head < values site head)
                site_obs['obs_diff'] = site_obs[
                    f"{obs_values_col}1"] - site_obs[f"{obs_values_col}2"]
                site_obs['sim_diff'] = site_obs[
                    f"{sim_values_col}1"] - site_obs[f"{sim_values_col}2"]

                # get a screen midpoint and add gradient
                screen_midpoint1 = None
                if {'screen_top1',
                        'screen_botm1'}.intersection(site_obs.columns):
                    screen_midpoint1 = site_obs[[
                        'screen_top1', 'screen_botm1'
                    ]].mean(axis=1)
                if {'screen_top2',
                        'screen_botm2'}.intersection(site_obs.columns):
                    screen_midpoint2 = site_obs[[
                        'screen_top2', 'screen_botm2'
                    ]].mean(axis=1)
                    if screen_midpoint1 is not None:
                        site_obs['dz'] = (screen_midpoint1 - screen_midpoint2)
                        site_obs[
                            'obs_grad'] = site_obs['obs_diff'] / site_obs['dz']
                        site_obs[
                            'sim_grad'] = site_obs['sim_diff'] / site_obs['dz']
                spatial_differences.append(site_obs)
    spatial_differences = pd.concat(spatial_differences)
    spatial_differences.dropna(subset=['obs_diff', 'sim_diff'],
                               axis=0,
                               inplace=True)

    # name the spatial head difference obs as
    # <obsprefix1><sep><obsprefix2>_<suffix>
    obsnme = []
    obsprefix = []
    for i, r in spatial_differences.iterrows():
        prefix1, suffix1 = r.obsnme1.split('_')
        prefix2, suffix2 = r.obsnme2.split('_')

        assert suffix1 == suffix2, "Observations are at different times! {}, {}".format(
            r.obsnme1, r.obsnme2)
        prefix = '{}{}{}'.format(
            prefix1,
            sep,
            prefix2,
        )
        obsnme.append('{}_{}'.format(prefix, suffix2))
        obsprefix.append(prefix)
    spatial_differences['obsnme'] = obsnme
    spatial_differences['obsprefix'] = obsprefix
    if 'obgnme' not in spatial_differences.columns:
        spatial_differences['obgnme'] = obstype
    spatial_differences['obgnme'] = [
        '{}_sdiff'.format(g) for g in spatial_differences['obgnme']
    ]

    # clean up columns
    cols = [
        'datetime', 'per', 'obsprefix', 'obsnme1', f"{obs_values_col}1",
        f"{sim_values_col}1", 'screen_top1', 'screen_botm1', 'layer1',
        'obsnme2', f"{obs_values_col}2", f"{sim_values_col}2", 'screen_top2',
        'screen_botm2', 'layer2', 'obs_diff', 'sim_diff', 'dz', 'obs_grad',
        'sim_grad', 'obgnme', 'obsnme'
    ]
    cols = [c for c in cols if c in spatial_differences.columns]
    spatial_differences = spatial_differences[cols]

    # whether to use gradients for the obsvals, or just head differences
    if use_gradients:
        spatial_differences['obsval'] = spatial_differences['obs_grad']
        spatial_differences['sim_obsval'] = spatial_differences['sim_grad']
        obstype = f'{obstype} gradients'
    else:
        spatial_differences['obsval'] = spatial_differences['obs_diff']
        spatial_differences['sim_obsval'] = spatial_differences['sim_diff']
        obstype = f'spatial {obstype} difference'
    spatial_differences.dropna(axis=0, subset=['obsval'], inplace=True)
    spatial_differences['type'] = obstype

    # uncertainty column is from base_data;
    # assume that spatial head differences have double the uncertainty
    # (two wells/two measurements per obs)
    if 'uncertainty' in spatial_differences.columns:
        spatial_differences['uncertainty'] *= 2

    # check for duplicates
    assert not spatial_differences['obsnme'].duplicated().any()

    # fill NaT (not a time) datetimes
    fill_nats(spatial_differences, perioddata)

    if outfile is not None:
        spatial_differences.fillna(-9999).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(spatial_differences):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(spatial_differences,
                          str(outfile) + '.ins',
                          obsnme_column='obsnme',
                          simulated_obsval_column='sim_obsval',
                          index=False)
    return spatial_differences
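
A hedged usage sketch: the site identifiers in difference_sites are hypothetical. Each key site is differenced against the site(s) matched by its values, at coincident stress periods; with use_gradients=True (and screen tops/bottoms in base_data), vertical gradients would be used instead of differences.

# base_data and perioddata as produced in the other examples;
# compare a hypothetical deep well (key) to its shallow companion
difference_sites = {'site-deep': ['site-shallow']}
head_sdiffs = get_spatial_differences(base_data,
                                      perioddata,
                                      difference_sites,
                                      obs_values_col='obs_head',
                                      sim_values_col='sim_head',
                                      use_gradients=False,
                                      outfile='processed_head_sdiff_obs.dat',
                                      write_ins=True)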
Example #6
def get_head_obs(perioddata,
                 modelgrid_transform,
                 model_output_file,
                 observed_values_file,
                 gwf_obs_input_file,
                 observed_values_metadata_file=None,
                 variable_name='head',
                 observed_values_site_id_col='obsprefix',
                 observed_values_datetime_col='datetime',
                 obsnme_date_suffix=True,
                 obsnme_suffix_format='%Y%m',
                 observed_values_obsval_col='obsval',
                 observed_values_x_col='x',
                 observed_values_y_col='y',
                 observed_values_screen_top_col='screen_top',
                 observed_values_screen_botm_col='screen_botm',
                 observed_values_layer_col=None,
                 observed_values_group_column='obgnme',
                 observed_values_unc_column='uncertainty',
                 aggregrate_observed_values_by='mean',
                 drop_groups=None,
                 hk_arrays=None,
                 top_array=None,
                 botm_arrays=None,
                 label_period_as_steady_state=None,
                 steady_state_period_start=None,
                 steady_state_period_end=None,
                 write_ins=False,
                 outfile=None):
    """Post-processes model output to be read by PEST, and optionally,
    writes a corresponding PEST instruction file. Reads model output
    using get_mf6_single_variable_obs(). General paradigm is to include all model
    layers in the MODFLOW input for each observation, and then post-process the model
    results to a single value by computing a transmissivity-weighted average.

    Observation names to match observed values to their simulated equivalents are constructed
    in the format of <obsprefix>_<date suffix>, where obsprefix is a site identifier taken
    from the ``observed_values_site_id_col`` in ``observed_values_file``. In creating
    observation names for MODFLOW output, the column names in the observation CSV output
    are used for the prefixes. Therefore, the identifiers in ``observed_values_site_id_col``
    should correspond to observations in the MODFLOW observation input. The date suffix
    is formatted using the ``obsnme_date_suffix_format`` parameter, which is also
    passed to :func:`~mfobs.modflow.get_mf6_single_variable_obs` for assigning observation
    names to the MODFLOW observation output.

    Optionally, a model stress period can be labeled as steady-state (``label_period_as_steady_state``),
    representing average conditions over a time period bracketed by a ``steady_state_period_start`` and
    ``steady_state_period_end``. In this case, the simulated values for the labeled stress period are
    matched to average values for the steady-state time period.

    Parameters
    ----------
    perioddata : DataFrame
        DataFrame with start/end dates for stress periods. Must have columns
        'time' (modflow time, in days), 'start_datetime' (start date for the stress period)
        and 'end_datetime' (end date for the stress period).
    modelgrid_transform : str
        An `affine.Affine <https://github.com/sgillies/affine>`_ object describing the orientation
        of the model grid. Modflow-setup :class:`~mfsetup.grid.MFsetupGrid` instances have this attached
        via the :meth:`~mfsetup.grid.MFsetupGrid.transform` property. Example::

            modelgrid_transform=affine.Affine(1000.0, 0.0, 500955,
                                              0.0, -1000.0, 1205285)

        for a uniform spacing of 1000 and an upper left corner at (500955, 1205285).
        For the same grid with a rotation of 45 degrees, counter-clockwise about the upper left corner::

            modelgrid_transform=affine.Affine(1000.0, 0.0, 500955,
                                              0.0, -1000.0, 1205285).rotation(45.)

        An ``affine.Affine`` instance can also be created from a
        `Modflow-setup <https://github.com/aleaf/modflow-setup>`_
        grid JSON file via the :func:`~mfobs.modflow.get_modelgrid_transform` function.

    model_output_file : str
        Modflow-6 head observation CSV output file.
        Read by :func:`~mfobs.modflow.get_mf6_single_variable_obs`.
    observed_values_file : str or DataFrame
        CSV file or DataFrame with observed values. Must have the following columns
        (default names are shown, other names can be specified with
        observed_values_**_col variables below):

        ============= ========================
        site_id       site identifier
        datetime      date/time of observation
        obsval        observed value
        ============= ========================

        can optionally include these columns, or this information can be supplied
        in an observed_values_metadata_file, which will be joined on site_id

        ============= ========================
        x             x location
        y             y location
        screen_top    screen top elevation
        screen_botm   screen bottom elevation
        ============= ========================

        If supplied, observation group and uncertainty information will be
        passed through to the output ``base_data`` DataFrame:

        ============= ==================================
        obgnme        observation group
        uncertainty   estimated measurement uncertainty
        ============= ==================================

        Locations and screen tops and bottoms are assumed to be in the same
        CRS and length units as the model.

    observed_values_metadata_file : str, optional
        Site information for the observed values timeseries. Should include a
        `site_id` column that is the same as observed_values_site_id_col, and any of
        the following columns that are not in the observed_values_file:

        ============= ========================
        x             x location
        y             y location
        screen_top    screen top elevation
        screen_botm   screen bottom elevation
        ============= ========================

    gwf_obs_input_file : str
        Input file to MODFLOW-6 observation utility (contains layer information).
    variable_name : str, optional
        Column with simulated output will be named "sim_<variable_name>",
        by default 'head'
    observed_values_site_id_col : str, optional
        Column name in observed_values_file with site identifiers,
        by default 'obsprefix'
    observed_values_datetime_col : str, optional
        Column name in observed_values_file with observation date/times,
        by default 'datetime'
    obsnme_date_suffix : bool
        If true, give observations a date-based suffix. Otherwise, assign a 
        stress period-based suffix. In either case, the format of the suffix
        is controlled by obsnme_suffix_format.
        by default True
    obsnme_suffix_format : str, optional
        Format for suffix of obsnmes. Observation names are created following the format of
        <obsprefix>_<date or stress period suffix>. By default, ``'%Y%m'``,
        which would yield ``'202001'`` for a Jan, 2020 observation 
        (obsnme_date_suffix=True). If obsnme_date_suffix=False, obsnme_suffix_format
        should be a decimal format in the "new-style" string format
        (e.g. '{:03d}', which would yield ``'001'`` for stress period 1.)
    observed_values_obsval_col : str, optional
        Column name in observed_values_file with observed values,
        by default 'obsval'
    observed_values_x_col : str, optional
        Column name in observed_values_file with x-coordinates,
        by default 'x'
    observed_values_y_col : str, optional
        Column name in observed_values_file with y-coordinates,
        by default 'y'
    observed_values_screen_top_col : str, optional
        Column name in observed_values_file with screen top elevations,
        by default 'screen_top'
    observed_values_screen_botm_col : str, optional
        Column name in observed_values_file with screen bottom elevations,
        by default 'screen_botm'
    observed_values_layer_col : str, optional
        As an alternative to providing screen tops and bottoms, the model layer
        for each observation can be specified directly via a layer column
        of zero-based layer numbers.
        by default None
    observed_values_group_column : str, optional
        Column name in observed_values_file with observation group information.
        Passed through to output ``base_data`` DataFrame, otherwise not required.
        by default 'obgnme'
    observed_values_unc_column : str, optional
        Column name in observed_values_file with observation uncertainty values.
        Passed through to output ``base_data`` DataFrame, otherwise not required.
        by default 'uncertainty'
    aggregrate_observed_values_by : str
        Method for aggregating observed values to the model stress periods,
        if there are multiple observed values in a stress period. Can be any
        of the method calls on the pandas
        `Resampler <https://pandas.pydata.org/pandas-docs/stable/reference/resampling.html>`_
        object. By default, 'mean'
    drop_groups : sequence, optional
        Observation groups to exclude from output, by default None
    hk_arrays : list-like, optional
        File paths to text arrays with hydraulic conductivity values
        (ordered by model layer). Used in the transmissivity-weighted averaging.
        by default None
    top_array : str, optional
        File paths to text array with model top elevations.
        Used in the transmissivity-weighted averaging.
        by default None
    botm_arrays : str, optional
        File paths to text arrays with model cell bottom elevations.
        (ordered by model layer). Used in the transmissivity-weighted averaging.
        by default None
    label_period_as_steady_state : int, optional
        Zero-based model stress period where observations will be
        assigned the suffix 'ss' instead of a date suffix.
        By default, None, in which case all model output is assigned
        a date suffix based on the start date of the stress period.
        Passed to :func:`~mfobs.modflow.get_mf6_single_variable_obs`.
    steady_state_period_start : str, optional
        Start date for the period representing steady-state conditions.
        Observations between ``steady_state_period_start`` and ``steady_state_period_end``
        will be averaged to create additional observations with the suffix 'ss'.
        The steady-state averages will be matched to model output from the
        stress period specified by ``label_period_as_steady_state``.
        By default None, in which case no steady-state observations are created.
    steady_state_period_end : str, optional
        End date for the period representing steady-state conditions.
        By default None, in which case no steady-state observations are created.
    outfile : str, optional
        CSV file to write output to.
        By default, None (no output written)
    write_ins : bool, optional
        Option to write instruction file, by default False

    Returns
    -------
    base_data : DataFrame
        With the following columns:

        ===================== ====================================================
        datetime              pandas datetimes for the start of each stress period
        per                   model stress period
        obsprefix             observation site identifier
        obsnme                observation name based on format of <obsprefix>_'%Y%m'
        obs_<variable_name>   observed values
        sim_<variable_name>   simulated observation equivalents
        screen_top            screen top elevation
        screen_botm           screen bottom elevation
        ===================== ====================================================

        Example observation names:

        site1000_202001, for a Jan. 2020 observation at site1000 (obsnme_date_suffix=True)
        
        site1000_001, for a stress period 1 observation at site1000 (obsnme_date_suffix=False)

        a steady-state stress period specified with label_period_as_steady_state 
        is given the suffix of 'ss'
        e.g. site1000_ss

    Notes
    -----
    All observation names and observation prefixes are converted to lower case
    to avoid potential case issues.


    """
    # validation checks
    check_obsnme_suffix(obsnme_date_suffix,
                        obsnme_suffix_format,
                        function_name='get_head_obs')

    outpath = Path('.')
    if outfile is not None:
        outpath = Path(outfile).parent

    obs_values_column = 'obs_' + variable_name  # output column with observed values
    sim_values_column = 'sim_' + variable_name  # output column with simulated equivalents to observed values

    perioddata = perioddata.copy()
    set_period_start_end_dates(perioddata)
    perioddata.index = perioddata.per
    results = get_mf6_single_variable_obs(
        perioddata,
        model_output_file=model_output_file,
        gwf_obs_input_file=gwf_obs_input_file,
        variable_name=variable_name,
        obsnme_date_suffix=obsnme_date_suffix,
        obsnme_suffix_format=obsnme_suffix_format,
        label_period_as_steady_state=label_period_as_steady_state)

    # rename columns to their defaults
    renames = {
        observed_values_site_id_col: 'obsprefix',
        observed_values_datetime_col: 'datetime',
        observed_values_x_col: 'x',
        observed_values_y_col: 'y',
        observed_values_screen_top_col: 'screen_top',
        observed_values_screen_botm_col: 'screen_botm',
        observed_values_layer_col: 'layer',
        observed_values_group_column: 'obgnme',
        observed_values_unc_column: 'uncertainty'
    }

    if not isinstance(observed_values_file, pd.DataFrame):
        observed = pd.read_csv(observed_values_file,
                               dtype={observed_values_site_id_col: object})
    else:
        observed = observed_values_file
    observed.rename(columns=renames, inplace=True)

    # read in the observed values metadata
    if observed_values_metadata_file is not None:
        if not isinstance(observed_values_metadata_file, pd.DataFrame):
            metadata = pd.read_csv(observed_values_metadata_file,
                                   dtype={observed_values_site_id_col: object})
        else:
            metadata = observed_values_metadata_file
        metadata.rename(columns=renames, inplace=True)

        # join the metadata to the observed data
        metadata.index = metadata['obsprefix'].values
        observed.index = observed['obsprefix'].values
        join_cols = [
            c for c in ['screen_top', 'screen_botm', 'x', 'y', 'layer']
            if c in metadata.columns
        ]
        observed = observed.join(metadata[join_cols])

    # convert obs names and prefixes to lower case
    observed['obsprefix'] = observed['obsprefix'].str.lower()

    # cast datetimes to pandas datetimes
    observed['datetime'] = pd.to_datetime(observed['datetime'])
    observed['steady'] = False  # flag for steady-state observations

    # drop model results that aren't in the obs information file
    # these are probably observations that aren't in the model time period
    # (and therefore weren't included in the parent model calibration;
    # but modflow-setup would include them in the MODFLOW observation input)
    # also drop sites that are in the obs information file, but not in the model results
    # these include sites outside of the model (i.e. in the inset when looking at the parent)
    no_info_sites = set(results.obsprefix).symmetric_difference(
        observed.obsprefix)
    # dump these out to a csv
    if len(no_info_sites) > 0:
        print('Dropping {} sites with no information'.format(
            len(no_info_sites)))
        dropped_obs_outfile = outpath / 'dropped_head_observation_sites.csv'
        results.loc[results.obsprefix.isin(no_info_sites)].to_csv(
            dropped_obs_outfile, index=False)
        results = results.loc[~results.obsprefix.isin(no_info_sites)].copy()
        observed = observed.loc[~observed.obsprefix.isin(no_info_sites)].copy()

    # get_mf6_single_variable_obs returns values for each layer
    # collapse these into one value for each location, time
    # by taking the transmissivity-weighted average
    if observed_values_layer_col is None:
        hk = load_array(hk_arrays)
        top = load_array(top_array)
        botm = load_array(botm_arrays)

    # get the x and y location and open interval corresponding to each head observation
    x = dict(zip(observed['obsprefix'], observed['x']))
    y = dict(zip(observed['obsprefix'], observed['y']))
    results['x'] = [x[obsprefix] for obsprefix in results.obsprefix]
    results['y'] = [y[obsprefix] for obsprefix in results.obsprefix]

    # get head values based on T-weighted average of open interval
    if observed_values_layer_col is None:
        screen_top = dict(zip(observed['obsprefix'], observed['screen_top']))
        screen_botm = dict(zip(observed['obsprefix'], observed['screen_botm']))
        results['screen_top'] = [
            screen_top[obsprefix] for obsprefix in results.obsprefix
        ]
        results['screen_botm'] = [
            screen_botm[obsprefix] for obsprefix in results.obsprefix
        ]

    # for each model stress period, get the simulated values
    # and the observed equivalents
    observed.index = pd.to_datetime(observed.datetime)
    periods = results.groupby('per')
    observed_simulated_combined = []
    for per, data in periods:

        # get the equivalent observed values
        start, end = perioddata.loc[per, ['start_datetime', 'end_datetime']]
        # date-based suffix
        if obsnme_date_suffix:
            suffix = pd.Timestamp(end).strftime(obsnme_suffix_format)
        # stress period-based suffix
        else:
            suffix = f"{per:{obsnme_suffix_format.strip('{:}')}}"

        # steady-state observations can represent a period
        # other than the "modflow time" in the perioddata table
        if per == label_period_as_steady_state:
            suffix = 'ss'
            if steady_state_period_start is not None:
                start = steady_state_period_start
            if steady_state_period_end is not None:
                end = steady_state_period_end
        observed_in_period = observed.sort_index().loc[start:end].reset_index(
            drop=True)
        if len(observed_in_period) == 0:
            warnings.warn(
                ('Stress period {}: No observations between start and '
                 'end dates of {} and {}!'.format(per, start, end)))
            continue
        observed_in_period.sort_values(by=['obsprefix', 'datetime'],
                                       inplace=True)
        if 'n' not in observed_in_period.columns:
            observed_in_period['n'] = 1
        by_site = observed_in_period.groupby('obsprefix')
        observed_in_period_rs = getattr(by_site,
                                        aggregrate_observed_values_by)()
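        # (aggregrate_observed_values_by names a pandas GroupBy method;
        # e.g. if it is 'mean', this is equivalent to by_site.mean())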
        observed_in_period_rs['n'] = by_site.n.sum()
        observed_in_period_rs['datetime'] = pd.Timestamp(end)
        observed_in_period_rs.reset_index(inplace=True)  # put obsprefix back

        missing_cols = set(observed_in_period.columns).difference(
            observed_in_period_rs.columns)
        for col in missing_cols:
            observed_in_period_rs[col] = by_site[col].first().values
        observed_in_period_rs = observed_in_period_rs[
            observed_in_period.columns]
        obsnames = [
            '{}_{}'.format(prefix.lower(), suffix)
            for prefix in observed_in_period_rs.obsprefix
        ]
        observed_in_period_rs['obsnme'] = obsnames
        observed_in_period_rs.index = observed_in_period_rs['obsnme']
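        # observation names follow the '<obsprefix>_<suffix>' pattern,
        # e.g. '04027500_201503' for a transient value or '04027500_ss' for the
        # steady-state period (the site prefix shown here is hypothetical)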

        # get head values based on T-weighted average of open interval
        if observed_values_layer_col is None:
            # get a n layers x n sites array of simulated head observations
            data = data.reset_index(drop=True)
            heads_2d = data.pivot(columns='layer',
                                  values='sim_head',
                                  index='obsnme').T.values
            obsnme = data.pivot(columns='layer',
                                values='obsnme',
                                index='obsnme').index.tolist()

            # x, y, screen_top and screen_botm have one value for each site
            kwargs = {}
            for arg in 'x', 'y', 'screen_top', 'screen_botm':
                # pivot data to nsites rows x nlay columns
                # positions without data are filled with nans
                pivoted = data.pivot(columns='layer',
                                     values=arg,
                                     index='obsnme')
                # reduce pivoted data to just one value per site by taking the mean
                # (values should be the same across columns, which represent layers)
                kwargs[arg] = pivoted.mean(axis=1).values

            # get the transmissivity associated with each head obs
            T = get_transmissivities(heads_2d,
                                     hk,
                                     top,
                                     botm,
                                     modelgrid_transform=modelgrid_transform,
                                     **kwargs)
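            # (get_transmissivities is assumed to return an nlay x nsites array
            # of transmissivity within each observation's open interval)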

            # compute transmissivity-weighted average heads
            Tr_frac = T / T.sum(axis=0)
            Tr_frac_df = pd.DataFrame(Tr_frac.transpose())
            Tr_frac_df['obsnme'] = obsnme
            Tr_frac_df.to_csv(outpath / 'obs_layer_transmissivities.csv',
                              float_format='%.2f')
            mean_t_weighted_heads = np.nansum((heads_2d * Tr_frac), axis=0)
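            # for example, if a well's open interval has T = [10, 30] across
            # two layers, the weights are 0.25 and 0.75, and the composite
            # head is 0.25 * h(layer 1) + 0.75 * h(layer 2)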

            # in some cases, the open interval might be mis-matched with the layering
            # for example, an open interval might be primarily in layer 4,
            # in a location where layer 5 is the only active layer
            # this would result in a mean_t_weighted_heads value of 0
            # (from the zero transmissivity in that layer)
            # fill these instances with the mean of any valid heads at those locations
            mean_heads = np.nanmean(heads_2d, axis=0)
            misaligned = mean_t_weighted_heads == 0
            mean_t_weighted_heads[misaligned] = mean_heads[misaligned]

            # verify that there are no nans in the extracted head values (one per obs)
            assert not np.any(np.isnan(mean_t_weighted_heads))

            # add the simulated heads onto the list for all periods
            mean_t_weighted_heads_df = pd.DataFrame(
                {sim_values_column: mean_t_weighted_heads}, index=obsnme)
            observed_in_period_rs[
                sim_values_column] = mean_t_weighted_heads_df[
                    sim_values_column]

        # Get head values for specified layers
        # (or closest layer if the specified layer doesn't have obs output)
        else:
            any_simulated_obs = data.obsnme.isin(
                observed_in_period_rs.obsnme).any()
            if not any_simulated_obs:
                continue
            sim_values = []
            for obsnme, layer in zip(observed_in_period_rs.obsnme,
                                     observed_in_period_rs.layer):
                obsnme_results = data.loc[obsnme]
                # if a DataFrame (with simulated values for multiple layers)
                # is returned, take the value from the layer closest to the
                # specified layer
                if len(obsnme_results.shape) == 2:
                    closest = np.argmin(np.abs(obsnme_results.layer.values -
                                               layer))
                    sim_value = obsnme_results.iloc[closest][sim_values_column]
                # Series (row) in results DataFrame with single simulated value
                else:
                    sim_value = obsnme_results[sim_values_column]
                sim_values.append(sim_value)
            observed_in_period_rs[sim_values_column] = sim_values

        # add stress period and observed values
        observed_in_period_rs['per'] = per
        observed_in_period_rs[obs_values_column] = observed_in_period_rs[
            observed_values_obsval_col]
        observed_simulated_combined.append(observed_in_period_rs)

    # Combined DataFrame of observed heads and simulated equivalents
    head_obs = pd.concat(observed_simulated_combined)

    # raise an error if there are duplicates; reindexing below will fail if this is the case
    if head_obs.index.duplicated().any():
        msg = (
            'The following observations have duplicate names. There should only be '
            'one observation per site for each time period implied by the '
            'obsnme_date_suffix and obsnme_suffix_format parameters.\n{}'.format(
                head_obs.loc[head_obs.index.duplicated(keep=False)]))
        raise ValueError(msg)

    # drop any observations in specified groups
    # (e.g. lake stages that should be compared with lake package output)
    if drop_groups is not None and 'obgnme' in head_obs.columns:
        head_obs = head_obs.loc[~head_obs.obgnme.isin(drop_groups)].copy()

    # nans are where sites don't have observation values for that period
    # or sites that are in other model (inset or parent)
    head_obs.dropna(subset=[obs_values_column], axis=0, inplace=True)

    # add standard obsval and obgnme columns
    head_obs['obsval'] = head_obs[obs_values_column]
    if 'obgnme' not in head_obs.columns:
        head_obs['obgnme'] = variable_name
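    # ('obsval' and 'obgnme' are the standard PEST observation value and
    # observation group column names)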

    # reorder the columns
    columns = [
        'datetime', 'per', 'obsprefix', 'obsnme', obs_values_column,
        sim_values_column, 'n', 'uncertainty', 'screen_top', 'screen_botm',
        'layer', 'obsval', 'obgnme'
    ]
    columns = [c for c in columns if c in head_obs.columns]
    head_obs = head_obs[columns].copy()
    if 'layer' in columns:
        head_obs['layer'] = head_obs['layer'].astype(int)

    # fill NaT (not a time) datetimes
    fill_nats(head_obs, perioddata)
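    # (fill_nats is assumed to fill missing (NaT) datetimes, e.g. for
    # steady-state observations, based on the perioddata table)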

    head_obs.sort_values(by=['obsprefix', 'per'], inplace=True)
    if outfile is not None:
        head_obs.fillna(-9999).to_csv(outfile, sep=' ', index=False)
        print(f'wrote {len(head_obs):,} observations to {outfile}')

        # write the instruction file
        if write_ins:
            write_insfile(head_obs,
                          str(outfile) + '.ins',
                          obsnme_column='obsnme',
                          simulated_obsval_column=sim_values_column,
                          index=False)
    return head_obs