Example #1
def despike(var, window_size, spike_method="median"):
    """
    Return a smooth baseline of data and the anomalous spikes

    This script is copied from Nathan Briggs' MATLAB script as described in
    Briggs et al. (2011). It returns the baseline of the data using a
    rolling window method and the residuals of [measurements - baseline].

    Parameters
    ----------
    var: numpy.ndarray or pandas.Series
        Array of the data variable that cleaning will be performed on.
    window_size: int
        the length of the rolling window
    spike_method: str
        A string with `minmax` or `median`. 'minmax' first applies a rolling
        minimum to the dataset and thereafter a rolling maximum. This
        forms the baseline, where the spikes are the difference from the
        baseline. 'median' applies a rolling median to the dataset, which
        forms the baseline. The spikes are the difference between the
        measurements and the baseline and, unlike 'minmax', can also be
        negative.

    Returns
    -------
    baseline: numpy.ndarray or pandas.Series
        The baseline from which outliers are determined.
    spikes: numpy.ndarray or pandas.Series
        Spikes are the residual of [measurements - baseline].


    """
    from numpy import array, isnan, nan, nanmax, nanmedian, nanmin, ndarray

    # convert to array
    arr = array(var)
    # create empty array for baseline
    baseline = ndarray(arr.shape) * nan
    # mask with existing nans masked out
    mask = ~isnan(arr)

    # if min-max method then get the rolling minimum and
    # then the rolling maximum
    if spike_method.startswith("min"):
        base_min = rolling_window(arr[mask], nanmin, window_size)
        base = rolling_window(base_min, nanmax, window_size)
    else:
        base = rolling_window(arr[mask], nanmedian, window_size)

    baseline[mask] = base
    spikes = arr - baseline

    baseline = transfer_nc_attrs(getframe(), var, baseline, "_baseline")
    spikes = transfer_nc_attrs(getframe(), var, spikes, "_spikes")

    return baseline, spikes
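A minimal standalone sketch of the same idea (a rolling-median baseline with the residuals as spikes), using only numpy and pandas on made-up data; it does not call the toolbox helpers (rolling_window, transfer_nc_attrs) used above.

import numpy as np
import pandas as pd

data = np.sin(np.linspace(0, 10, 200)) + np.random.normal(0, 0.05, 200)
data[50] += 2.0  # inject an artificial spike

ser = pd.Series(data)
# the rolling median forms the baseline; the residuals are the spikes
baseline = ser.rolling(window=7, center=True, min_periods=1).median()
spikes = ser - baseline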
Example #2
def par_scaling(par_uV, scale_factor_wet_uEm2s, sensor_output_mV):
    """
    Scaling correction for par with factory calibration coefficients.

    The function subtracts the sensor output from the raw data and divides
    by the scale factor. The factory calibrations are unique for each
    deployment and should be taken from the calibration file for that
    deployment.

    Parameters
    ----------
    par_uV: numpy.ndarray or pandas.Series
        The raw par data with units uV.
    scale_factor_wet_uEm2s: float
        The scale factor from the factory calibration file in units uE/m2/sec.
    sensor_output_mV: float
        The sensor output in the dark from the factory calibration file in
        units mV.

    Returns
    -------
    par_uEm2s: numpy.ndarray or pandas.Series
        The par data corrected for the sensor output and scale factor from the
        factory calibration file in units uE/m2/sec.

    """
    sensor_output_uV = sensor_output_mV / 1000.0

    par_uEm2s = (par_uV - sensor_output_uV) / scale_factor_wet_uEm2s

    par_uEm2s = transfer_nc_attrs(getframe(), par_uV, par_uEm2s, 'par_uEm2s')

    return par_uEm2s
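A tiny worked example of the conversion above with made-up calibration values (real values must come from the deployment's factory calibration sheet):

import numpy as np

par_uV = np.array([10500.0, 9800.0, 250.0])  # raw PAR, microvolts (made up)
scale_factor_wet_uEm2s = 5.2                 # assumed factory scale factor
sensor_output_mV = 10.0                      # assumed factory dark output, mV

sensor_output_uV = sensor_output_mV / 1000.0
par_uEm2s = (par_uV - sensor_output_uV) / scale_factor_wet_uEm2s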
Example #3
def potential_density(salt_PSU, temp_C, pres_db, lat, lon, pres_ref=0):
    """
    Calculate density from glider measurements of salinity and temperature.

    The Basestation calculates density from absolute salinity and potential
    temperature. This function is a wrapper for this functionality, where
    potential temperature and absolute salinity are calculated first.
    Note that a reference pressure of 0 is used by default.

    Parameters
    ----------
    salt_PSU : array, dtype=float, shape=[n, ]
        practical salinity
    temp_C : array, dtype=float, shape=[n, ]
        temperature in deg C
    pres_db : array, dtype=float, shape=[n, ]
        pressure in decibar
    lat : array, dtype=float, shape=[n, ]
        latitude in degrees north
    lat : array, dtype=float, shape=[n, ]
        longitude in degrees east
    pres_ref : float, default=0
        reference pressure in decibar for the potential density calculation

    Returns
    -------
    potential_density : array, dtype=float, shape=[n, ]


    Note
    ----
    Using seawater.dens does not yield the same results as this function. We
    get very close results to what the SeaGlider Basestation returns with this
    function. The difference of this function with the basestation is on
    average ~ 0.003 kg/m3
    """

    try:
        import gsw

        salt_abs = gsw.SA_from_SP(salt_PSU, pres_db, lon, lat)
        temp_pot = gsw.t_from_CT(salt_abs, temp_C, pres_db)
        pot_dens = gsw.pot_rho_t_exact(salt_abs, temp_pot, pres_db, pres_ref)
    except ImportError:
        import seawater as sw

        pot_dens = sw.pden(salt_PSU, temp_C, pres_db, pres_ref)

    pot_dens = transfer_nc_attrs(
        getframe(),
        temp_C,
        pot_dens,
        'potential_density',
        units='kg/m3',
        comment='',
        standard_name='potential_density',
    )
    return pot_dens
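A standalone sketch of the same gsw (TEOS-10) call sequence as the try branch above, applied to made-up values; it assumes the gsw package is installed.

import numpy as np
import gsw

salt_PSU = np.array([34.5, 34.7])
temp_C = np.array([12.0, 8.0])
pres_db = np.array([10.0, 500.0])
lat, lon = -43.0, 8.5

salt_abs = gsw.SA_from_SP(salt_PSU, pres_db, lon, lat)
temp_pot = gsw.t_from_CT(salt_abs, temp_C, pres_db)
pot_dens = gsw.pot_rho_t_exact(salt_abs, temp_pot, pres_db, 0)  # pres_ref=0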
Example #4
    def brunt_vaisala(salt, temp, pres, lat=None):
        r"""
        Calculate the square of the buoyancy frequency.

        This is a copy from GSW package, with the exception that
        the array maintains the same shape as the input. Note that
        it only works on ungridded data at the moment.

        .. math::

            N^{2} = \frac{-g}{\sigma_{\theta}} \frac{d\sigma_{\theta}}{dz}

        Parameters
        ----------
        salt : array-like
            Absolute Salinity, g/kg
        temp : array-like
            Conservative Temperature (ITS-90), degrees C
        pres : array-like
            Sea pressure (absolute pressure minus 10.1325 dbar), dbar
        lat : array-like, 1-D, optional
            Latitude, degrees.

        Returns
        -------
        N2 : array
            Buoyancy frequency-squared at pressure midpoints, 1/s^2.
            The output is padded with NaN so that its shape matches
            that of the inputs.
        """

        from gsw import Nsquared
        from numpy import nan, r_

        def pad_nan(a):
            return r_[a, nan]

        n2 = pad_nan(Nsquared(salt, temp, pres)[0])

        n2 = transfer_nc_attrs(
            getframe(),
            temp,
            n2,
            'N_squared',
            units='1/s2',
            comment='',
            standard_name='brunt_vaisala_freq',
        )

        return n2
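A standalone sketch of the NaN-padding idea used above, assuming the gsw package is available; the profile values are illustrative only.

import numpy as np
from gsw import Nsquared

salt = np.array([34.5, 34.6, 34.7, 34.8])    # absolute salinity, g/kg
temp = np.array([12.0, 11.0, 9.0, 8.0])      # conservative temperature, deg C
pres = np.array([10.0, 50.0, 100.0, 200.0])  # sea pressure, dbar

n2, p_mid = Nsquared(salt, temp, pres)
n2_padded = np.r_[n2, np.nan]  # pad so the output matches the input length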
Example #5
    def find_signature(sign):
        path = Path(inspect.getfile(
            inspect.currentframe())).parent / 'signatures.json'
        with path.open('r') as data_file:
            data = json.load(data_file)

        list_name = [name for name, hexa in data.items() if hexa == sign]

        if len(list_name) > 1:
            logging.warning('function signatures collision: %s', list_name)
            return '_or_'.join(list_name)
        elif list_name:
            return list_name[0]
        else:
            return None
Example #6
def mask_bad_dive_fraction(mask, dives, var, mask_frac=0.2):
    """
    Find bad dives - where more than a fraction of the dive is masked

    Parameters
    ----------
    mask : array, dtype=bool, shape=[n, ]
        boolean 1D array with masked values
    dives : array, dtype=float, shape=[n, ]
        discrete dive numbers (whole numbers for down dives, n.5 for up dives)
    var : array, dtype=float, shape=[n, ]
        series or array containing data that will be masked with NaNs
    mask_frac : float=0.2
        fraction of the dive that must be masked for the whole dive to be
        deemed bad

    Returns
    -------
    var : array, dtype=float, shape=[n, ]
        the same as the input, but has been masked
    mask_dives : array, dtype=bool
        a mask array that has full dives that are deemed "bad" masked out

    """
    from numpy import NaN, array
    from pandas import Series

    # catch dives where the majority of the data is masked
    # and return a fully masked dive
    dives = array(dives)
    arr = array(var)

    grp = Series(mask).groupby(dives)
    masked_frac_per_dive = grp.sum() / grp.count() > mask_frac
    majority_masked = masked_frac_per_dive[masked_frac_per_dive].index.values

    # create a mask that masks ungridded data
    mask_dives = mask.copy()
    for d in majority_masked:
        i = array(dives) == d
        mask_dives[i] = True

    arr[mask_dives] = NaN
    baddive = arr

    baddive = transfer_nc_attrs(getframe(), var, baddive, None)

    return baddive, mask_dives
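A standalone illustration of the per-dive masked-fraction logic with pandas on tiny made-up arrays (not toolbox code):

import numpy as np
import pandas as pd

dives = np.array([1.0, 1.0, 1.0, 1.0, 1.5, 1.5, 1.5, 1.5])
mask = np.array([True, True, True, False, False, False, False, True])

grp = pd.Series(mask).groupby(dives)
masked_frac = grp.sum() / grp.count()                    # fraction masked per dive
bad_dives = masked_frac[masked_frac > 0.2].index.values  # dives to blank fully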
Example #7
    def spice0(salt_PSU, temp_C, pres_db, lat, lon):
        """
        Calculate spiciness from glider measurements of salinity and temperature.

        Parameters
        ----------
        salt_PSU : array, dtype=float, shape=[n, ]
            practical salinity
        temp_C : array, dtype=float, shape=[n, ]
            temperature in deg C
        pres_db : array, dtype=float, shape=[n, ]
            pressure in decibar
        lat : array, dtype=float, shape=[n, ]
            latitude in degrees north
        lon : array, dtype=float, shape=[n, ]
            longitude in degrees east

        Returns
        -------
        spice0 : array, dtype=float, shape=[n, ]
            spiciness referenced to a sea pressure of 0 dbar
        """
        import gsw

        salt_abs = gsw.SA_from_SP(salt_PSU, pres_db, lon, lat)
        cons_temp = gsw.CT_from_t(salt_abs, temp_C, pres_db)

        spice0 = gsw.spiciness0(salt_abs, cons_temp)

        spice0 = transfer_nc_attrs(
            getframe(),
            temp_C,
            spice0,
            "spiciness0",
            units=" ",
            comment="",
            standard_name="spiciness0",
        )
        return spice0
Example #8
def par_dark_count(par, dives, depth, time):
    """
    Calculates an in situ dark count from the PAR sensor.

    The in situ dark count for the PAR sensor is calculated as the median of
    values measured at night (roughly 23:00-01:59) and within the deepest
    10 % (above the 90th percentile) of the sampled depths.

    Parameters
    ----------

    par: numpy.ndarray or pandas.Series
        The par array after factory calibration in units uE/m2/sec.
    dives: numpy.ndarray or pandas.Series
        The dive count (round is down dives, 0.5 is up dives).
    depth: numpy.ndarray or pandas.Series
        The depth array in metres.
    time: numpy.ndarray or pandas.Series
        The date & time array in a numpy.datetime64 format.

    Returns
    -------

    par_dark: numpy.ndarray or pandas.Series
        The par data corrected for the in situ dark value in units uE/m2/sec.
    """
    from numpy import array, ma, nanmedian, isnan, nanpercentile

    par_arr = array(par)
    dives = array(dives)
    depth = array(depth)
    time = array(time)

    # DARK CORRECTION FOR PAR
    hrs = time.astype('datetime64[h]') - time.astype('datetime64[D]')
    xi = ma.masked_inside(hrs.astype(int), 22, 2)  # keep only hours 23:00-01:59
    yi = ma.masked_outside(depth,
                           *nanpercentile(depth[~isnan(par)],
                                          [90, 100]))  # 90th pctl of depth
    i = ~(xi.mask | yi.mask)
    dark = nanmedian(par_arr[i])
    par_dark = par_arr - dark
    par_dark[par_dark < 0] = 0

    par_dark = transfer_nc_attrs(getframe(), par, par_dark, '_dark')

    return par_dark
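A standalone sketch of the night/deep selection used above on synthetic data; the hour window and percentile follow the code, everything else is made up.

import numpy as np

time = np.arange(np.datetime64('2019-01-01', 'm'),
                 np.datetime64('2019-01-02', 'm'),
                 np.timedelta64(30, 'm'))
depth = np.linspace(0, 500, time.size)
par = np.random.rand(time.size)

hour = (time.astype('datetime64[h]') - time.astype('datetime64[D]')).astype(int)
night = (hour >= 23) | (hour < 2)            # roughly 23:00-01:59
deep = depth >= np.nanpercentile(depth, 90)  # deepest 10 % of the samples
dark_estimate = np.nanmedian(par[night & deep])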
Example #9
def outlier_bounds_iqr(arr, multiplier=1.5):
    r"""
    Mask values outside the upper/lower outlier limits by interquartile range:

    .. math::

        lim_{low} = Q_1 - 1.5\cdot(Q_3 - Q_1)\\
        lim_{up} = Q_3 + 1.5\cdot(Q_3 - Q_1)

    The multiplier [1.5] can be adjusted by the user. Values outside these
    limits are masked.

    Parameters
    ----------
    arr : np.array|xr.DataArray, dtype=float, shape=[n, ]
        the full timeseries of the entire dataset
    multiplier : float=1.5
        the multiplier applied to the interquartile range

    Returns
    -------
    arr : array | xarray.DataArray
        A data object where values outside the limits are masked.
        Metadata will be preserved if the original input array is xr.DataArray


    """
    from numpy import array, nan, nanpercentile

    var = arr.copy()
    arr = array(arr)

    q1, q3 = nanpercentile(arr, [25, 75])
    iqr = q3 - q1

    ll = q1 - iqr * multiplier
    ul = q3 + iqr * multiplier

    mask = (arr < ll) | (arr > ul)
    arr[mask] = nan

    attrs = dict(outlier_lims=[ll, ul])

    out = transfer_nc_attrs(getframe(), var, arr, "_outlierIQR", **attrs)
    return out
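A quick standalone demonstration of the IQR limits on a toy array (not toolbox code):

import numpy as np

x = np.array([1.0, 2.0, 2.5, 3.0, 2.8, 50.0, 2.2])  # 50.0 is the outlier
q1, q3 = np.nanpercentile(x, [25, 75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
x_masked = np.where((x < lower) | (x > upper), np.nan, x)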
Example #10
def outlier_bounds_std(arr, multiplier=3):
    r"""
    Mask values outside the upper and lower outlier limits by standard
    deviation

        :math:`\mu \pm 3\sigma`

    The multiplier [3] can be adjusted by the user. Values outside these
    limits are masked.

    Parameters
    ----------
    arr : np.array|xr.DataArray, dtype=float, shape=[n, ]
        the full timeseries of the entire dataset
    multiplier : float=3
        sets the standard deviation multiplier

    Returns
    -------
    arr : array | xarray.DataArray
        A data object where values outside the limits are masked.
        Metadata will be preserved if the original input array is xr.DataArray

    """

    from numpy import array, nan, nanmean, nanstd

    var = arr.copy()
    arr = array(arr)

    mean = nanmean(arr)
    std = nanstd(arr)

    ll = mean - std * multiplier
    ul = mean + std * multiplier

    mask = (arr < ll) | (arr > ul)
    arr[mask] = nan

    attrs = dict(outlier_lims=[ll, ul])

    out = transfer_nc_attrs(getframe(), var, arr, "_outlierSTD", **attrs)

    return out
Example #11
    def predict(self, x):
        """
        A wrapper around the normal predict function that takes
        nans into account. An extra dimension is also added if needed.
        """
        from xarray import DataArray

        var = x.copy()
        x = _np.array(x)
        out = _np.ndarray(x.size) * _np.NaN
        i = ~_np.isnan(x)
        x = x[i].reshape(-1, 1)
        out[i.squeeze()] = self._predict(x).squeeze()

        out = transfer_nc_attrs(getframe(), var, out, "_calibrated")
        if hasattr(self, "info") and isinstance(out, DataArray):
            out.attrs["model_info"] = str(self.info)

        return out
Example #12
def fluorescence_dark_count(flr, depth, percentile=5):
    """
    Calculates an in situ dark count from the fluorescence sensor.

    The in situ dark count for the fluorescence sensor is calculated from the
    user-defined percentile between 300 and 400m.

    Parameters
    ----------

    flr: numpy.ndarray or pandas.Series
        The fluorescence array after factory calibration.
    depth: numpy.ndarray or pandas.Series
        The depth array in metres.
    percentile: int=5
        The percentile of the fluorescence between 300 and 400 m used as the
        in situ dark count.

    Returns
    -------

    flr: numpy.ndarray or pandas.Series
        The fluorescence data corrected for the in situ dark value.

    """
    from numpy import array, isnan, nanpercentile
    import warnings

    mask = (depth > 300) & (depth < 400)
    flr_dark = array(flr)

    if (~isnan(flr_dark[mask])).sum() == 0:
        warnings.warn(
            "\nThere are no fluorescence measurements between "
            "300 and 400 metres.\nThe dark count correction "
            "cannot be made and fluorescence data can't be processed.",
            UserWarning,
        )
    dark_pctl = nanpercentile(flr_dark[mask], percentile)
    flr_dark -= dark_pctl
    flr_dark[flr_dark < 0] = 0

    flr_dark = transfer_nc_attrs(getframe(), flr, flr_dark, "_dark")

    return flr_dark
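A standalone sketch of the deep-percentile dark correction on a made-up profile (illustrative only):

import numpy as np

depth = np.linspace(0, 500, 1000)
flr = 10 * np.exp(-depth / 50) + 0.3 + np.random.normal(0, 0.02, depth.size)

deep = (depth > 300) & (depth < 400)
dark = np.nanpercentile(flr[deep], 5)    # in situ dark value
flr_dark = np.clip(flr - dark, 0, None)  # subtract and clip negatives to zero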
Example #13
def backscatter_dark_count(bbp, depth, percentile=5):
    """
    Calculates an in situ dark count from the backscatter sensor.

    The in situ dark count for the backscatter sensor is calculated from the
    user-defined percentile between 200 and 400m.

    Parameters
    ----------

    bbp: numpy.ndarray or pandas.Series
        The total backscatter array after factory calibration in m-1.
    depth: numpy.ndarray or pandas.Series
        The depth array in metres.
    percentile: int=5
        The percentile of the backscatter between 200 and 400 m used as the
        in situ dark count.

    Returns
    -------

    bbp: numpy.ndarray or pandas.Series
        The total backscatter data corrected for the in situ dark value.
    """
    from numpy import array, isnan, nanpercentile
    import warnings

    bbp_dark = array(bbp)
    mask = (depth > 200) & (depth < 400)
    if (~isnan(bbp[mask])).sum() == 0:
        warnings.warn(
            "There are no backscatter measurements between 200 "
            "and 400 metres.The dark count correction cannot be "
            "made and backscatter data can't be processed.",
            UserWarning,
        )

    dark_pctl = nanpercentile(bbp_dark[mask], percentile)
    bbp_dark -= dark_pctl
    bbp_dark[bbp_dark < 0] = 0

    bbp_dark = transfer_nc_attrs(getframe(), bbp, bbp_dark, "_dark")

    return bbp_dark
Example #14
def rolling_window(var, func, window):
    """
    A rolling window function that is NaN-resilient

    Parameters
    ----------
    var : array, dtype=float, shape=[n, ]
        the array that the rolling window will be passed over
    func : callable
        an aggregating function. e.g. mean, std, median
    window : int
        the size of the rolling window that will be applied

    Returns
    -------
    arr : array, dtype=float, shape=[n, ]
        the same as the input array, but the rolling window has been applied
    """
    from numpy import array, nan, ndarray, r_

    n = window
    # create an empty 2D array with shape (window, len(arr) - window)
    arr = array(var)
    mat = ndarray([n, len(arr) - n]) * nan
    # create a vector for each window
    for i in range(n):
        mat[i, :] = arr[i:i - n]
    # get the mean or median or any other function of the matrix
    out = func(mat, axis=0)

    # the array will be shorter than the original
    # pad the output with the rolling average of the values left out
    i0 = n // 2
    i1 = n - i0
    seg0 = array([func(arr[:i + 1]) for i in range(i0)])
    seg1 = array([func(arr[-i - 1:]) for i in range(i1)])
    rolwin = r_[seg0, out, seg1]

    rolwin = transfer_nc_attrs(getframe(), var, rolwin, "_rollwin")

    return rolwin
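For comparison, a NaN-tolerant rolling median can also be done with pandas instead of the manual window matrix above; this sketch is illustrative only and its edge handling differs slightly.

import numpy as np
import pandas as pd

arr = np.array([1.0, 2.0, np.nan, 4.0, 100.0, 6.0, 7.0])
rolled = (pd.Series(arr)
          .rolling(window=3, center=True, min_periods=1)
          .median()
          .to_numpy())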
Example #15
def fluorescence_dark_count(flr, depth):
    """
    Calculates an in situ dark count from the fluorescence sensor.

    The in situ dark count for the fluorescence sensor is calculated from the
    5th percentile between 300 and 400m.

    Parameters
    ----------

    flr: numpy.ndarray or pandas.Series
        The fluorescence array after factory calibration.
    depth: numpy.ndarray or pandas.Series
        The depth array in metres.

    Returns
    -------

    flr: numpy.ndarray or pandas.Series
        The fluorescence data corrected for the in situ dark value.

    """
    from numpy import nanpercentile, isnan, array

    mask = (depth > 300) & (depth < 400)
    flr_dark = array(flr)

    if (~isnan(flr_dark[mask])).sum() == 0:
        raise UserWarning(
            '\nThere are no fluorescence measurements between '
            '300 and 400 metres.\nThe dark count correction '
            "cannot be made and fluorescence data can't be processed.")
    dark_pctl5 = nanpercentile(flr_dark[mask], 5)

    flr_dark -= dark_pctl5
    flr_dark[flr_dark < 0] = 0
    flr_dark = transfer_nc_attrs(getframe(), flr, flr_dark, '_dark')

    return flr_dark
Example #16
def time_average_per_dive(dives, time):
    """
    Gets the average time stamp per dive. This is used to create pseudo
    discrete time steps per dive for plotting data (using time as x-axis
    variable).

    Parameters
    ----------
    dives : np.array, dtype=float, shape=[n, ]
        discrete dive numbers (down = d.0; up = d.5) that matches time length
    time : np.array, dtype=datetime64, shape=[n, ]
        time stamp for each observed measurement

    Returns
    -------
    time_average_per_dive : np.array, dtype=datetime64, shape=[n, ]
        each dive will have the average time stamp of that dive. Can be used
        for plotting where time_average_per_dive is set as the x-axis.
    """
    from numpy import array, datetime64, nanmean
    from pandas import Series

    atime = array(time)
    dives = array(dives)
    if isinstance(atime[0], datetime64):
        t = atime.astype("datetime64[s]").astype(float)
    else:
        t = atime

    t_grp = Series(t).groupby(dives)
    t_mid = nanmean([t_grp.max(), t_grp.min()], axis=0)
    t_ser = Series(t_mid, index=t_grp.mean().index.values)
    diveavg = t_ser.reindex(index=dives).values
    diveavg = diveavg.astype("datetime64[s]")

    diveavg = transfer_nc_attrs(getframe(), time, diveavg, "_diveavg")

    return diveavg
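A standalone sketch of the per-dive mid-time calculation with pandas on made-up data (not toolbox code):

import numpy as np
import pandas as pd

dives = np.array([1.0, 1.0, 1.0, 1.5, 1.5, 1.5])
time = np.array(['2019-01-01T00:00', '2019-01-01T01:00', '2019-01-01T02:00',
                 '2019-01-01T02:30', '2019-01-01T03:30', '2019-01-01T04:30'],
                dtype='datetime64[s]')

t = time.astype(float)                 # seconds since the epoch
grp = pd.Series(t).groupby(dives)
t_mid = (grp.max() + grp.min()) / 2.0  # midpoint time per dive
dive_time = (t_mid.reindex(dives)      # broadcast back to every sample
             .to_numpy().astype('int64').astype('datetime64[s]'))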
Example #17
def backscatter_dark_count(bbp, depth):
    """
    Calculates an in situ dark count from the backscatter sensor.

    The in situ dark count for the backscatter sensor is calculated from the
    5th percentile between 200 and 400m.

    Parameters
    ----------

    bbp: numpy.ndarray or pandas.Series
        The total backscatter array after factory calibration in m-1.
    depth: numpy.ndarray or pandas.Series
        The depth array in metres.

    Returns
    -------

    bbp: numpy.ndarray or pandas.Series
        The total backscatter data corrected for the in situ dark value.
    """
    from numpy import nanpercentile, isnan, array

    bbp_dark = array(bbp)
    mask = (depth > 200) & (depth < 400)
    if (~isnan(bbp[mask])).sum() == 0:
        raise UserWarning('There are no backscatter measurements between 200 '
                          'and 400 metres. The dark count correction cannot be '
                          "made and backscatter data can't be processed.")
    dark_pctl5 = nanpercentile(bbp_dark[mask], 5)

    bbp_dark -= dark_pctl5
    bbp_dark[bbp_dark < 0] = 0

    bbp_dark = transfer_nc_attrs(getframe(), bbp, bbp_dark, '_dark')

    return bbp_dark
Example #18
def quenching_correction(
    flr,
    bbp,
    dives,
    depth,
    time,
    lat,
    lon,
    max_photic_depth=100,
    night_day_group=True,
    surface_layer=5,
    sunrise_sunset_offset=1,
):
    """
    Corrects the fluorescence data based upon Thomalla et al. (2017).

    The function calculates the quenching depth and performs the quenching
    correction based on the fluorescence to backscatter ratio. The quenching
    depth is calculated based upon the difference between night and daytime
    fluorescence. The default setting is for the preceding night to be used to
    correct the following day's quenching (`night_day_group=True`). This can
    be changed so that the following night is used to correct the preceding
    day. The quenching depth is then found from the difference between the
    night and daytime fluorescence, using the steepest gradient of the five
    smallest differences and the points where the difference changes sign
    (+ve/-ve). The function gets the backscatter/fluorescence ratio from the
    quenching depth to the surface, and then calculates a mean nighttime
    ratio for each night. The quenching ratio is calculated from the nighttime
    ratio and the daytime ratio, which is then applied to fluorescence to
    correct for quenching. If the corrected value is less than raw, then the
    function will return the original raw data.

    Parameters
    ----------
    flr: numpy.ndarray or pandas.Series
        fluorescence data after cleaning and factory calibration conversion
    bbp: numpy.ndarray or pandas.Series
        Total backscatter after cleaning and factory calibration conversion
    dives: numpy.ndarray or pandas.Series
        The dive count (round is down dives, 0.5 is up dives).
    depth: numpy.ndarray or pandas.Series
        The depth array in metres.
    time: numpy.ndarray or pandas.Series
        The date & time array in a numpy.datetime64 format.
    lat: numpy.ndarray or pandas.Series
        The latitude of the glider position.
    lon: numpy.ndarray or pandas.Series
        The longitude of the glider position.
    max_photic_depth: int
        Limit the quenching correction to depth less than a given value [100].
    night_day_group: bool
        If True, use preceding night otherwise use following night for
        calculating the flr:bbp ratio.
    surface_layer: int
        The surface depth that is omitted from the correction calculations
        (metres)
    sunrise_sunset_offset: int
        The delayed onset and recovery of quenching in hours [1]
        (assumes symmetrical).

    Returns
    -------
    flr_corrected: numpy.ndarray or pandas.Series
        The fluorescence data corrected for quenching.
    quenching layer: bool
        A boolean mask of where the fluorescence is quenched.

    """

    import numpy as np
    import pandas as pd
    from scipy.interpolate import Rbf
    from .cleaning import rolling_window

    def grad_min(depth, fluor_diff, surface_layer=5):
        """
        TODO:   need to refine this function. Doesn't always correct to
                the deepest quenching point
        Quenching depth for a day/night fluorescence difference

        INPUT:   depth and fluorescence as pd.Series or np.ndarray
                 surface_layer [5] is the depth to search for the
                     reference in the gradient
        OUTPUT:  Quenching layer as a boolean mask
        """
        if depth.size <= surface_layer:
            return np.zeros(depth.size).astype(bool)

        x = np.array(depth)
        y = rolling_window(np.array(fluor_diff), np.nanmean, 5)
        s = x < surface_layer  # surface data to the top 5 metres
        mask = np.zeros(depth.size).astype(bool)

        # get the smallest 5 points and where the difference crosses 0
        small5 = np.argsort(np.abs(y))[:5]
        cross0 = np.where(np.r_[False, np.diff((y) > 0)])[0]
        # combine the indices
        i = np.unique(np.r_[small5, cross0])
        # the max in the surface as a reference
        if not s.sum():
            return mask
        j = y[s].argmax()

        # calculate the gradient of the selected points to the reference
        grad = (y[s][j] - y[i]) / (x[s][j] - x[i])
        # If there are only nans in the gradient return only nans
        if np.isnan(grad).all():
            return mask
        # get the index of the steepest gradient (min)
        grad_min_i = i[np.nanargmin(grad)]

        # fill the mask with True values above the quenching depth
        mask[0:grad_min_i] = True
        # on up dives the array is backwards so reverse the mask
        if x[-1] < x[0]:
            mask = ~mask
        # If the majority of the points in the selected region are
        # negative (night < day) then return an empty mask
        return mask

    var = flr.copy()  # create a copy for netCDF attrs preservation

    flr = np.array(flr)
    bbp = np.array(bbp)
    dives = np.array(dives)
    depth = np.array(depth)
    time = np.array(time)
    lat = np.array(lat)
    lon = np.array(lon)

    # ############################ #
    #  GENERATE DAY/NIGHT BATCHES  #
    # ############################ #
    sunrise, sunset = sunset_sunrise(time, lat, lon)
    offset = np.timedelta64(sunrise_sunset_offset, 'h')
    # creating quenching correction batches, where a batch is a night and the
    # following day
    day = (time > (sunrise + offset)) & (time < (sunset + offset))
    # find day and night transitions
    daynight_transitions = np.abs(np.diff(day.astype(int)))
    # get the cumulative sum of daynight to generate separate batches for day
    # and night
    daynight_batches = daynight_transitions.cumsum()
    # now get the batches with padded 0 to account for the diff
    # also add a bool that makes night_day or day_night batches
    batch = np.r_[0, (daynight_batches + night_day_group) // 2]
    isday = (np.r_[0, daynight_batches] / 2 % 1) == 0

    # ######################## #
    #  GET NIGHTTIME AVERAGES  #
    # ######################## #
    # blank arrays to be filled
    flr_night, bbp_night = flr.copy(), bbp.copy()

    # create a dataframe with fluorescence and backscatter
    df = pd.DataFrame(np.c_[flr, bbp], columns=['flr', 'bbp'])
    # get the binned averages for each batch and select the night
    night_ave = df.groupby([day, batch, np.around(depth)]).mean()
    night_ave = night_ave.dropna().loc[False]
    # A second group where only batches are grouped
    grp_batch = df.groupby(batch)

    # GETTING NIGHTTIME AVERAGE FOR NONGRIDDED DATA - USE RBF INTERPOLATION
    for b in np.unique(night_ave.index.labels[0]):
        i = grp_batch.groups[b].values  # batch index
        j = i[~np.isnan(flr[i]) & (depth[i] < 400)]  # index without nans
        x = night_ave.loc[b].index.values  # batch depth
        y = night_ave.loc[b]  # batch flr and bbp

        if y.flr.isna().all() | y.bbp.isna().all():
            continue
        elif y.flr.size <= 2:
            continue
        # radial basis functions with a smoothing factor
        f1 = Rbf(x, y.flr.values, function='linear', smooth=20)
        f2 = Rbf(x, y.bbp.values, function='linear', smooth=20)
        # interpolation function is used to find flr and bbp for all
        # nighttime fluorescence
        flr_night[j] = f1(depth[j])
        bbp_night[j] = f2(depth[j])

    # calculate the difference between the average nighttime fluorescence
    # and the measured fluorescence
    fluor_diff = flr_night - flr

    # ################################ #
    #  FIND THE QUENCHING DEPTH LAYER  #
    # ################################ #
    # create a "photic layer" mask to which calc will be limited daytime,
    # shalower than [100m] and fluoresence is quenched relative to night
    photic_layer = isday & (depth < max_photic_depth) & (fluor_diff > 0)
    # blank array to be filled
    quenching_layer = np.zeros(depth.size).astype(bool)
    # create a grouped dataset by dives to find the depth of quenching
    cols = np.c_[depth, fluor_diff, dives][photic_layer]
    grp = pd.DataFrame(cols, columns=['depth', 'flr_dif', 'dives'])
    grp = grp.groupby('dives')
    # apply the minimum gradient algorithm to each dive
    quench_mask = grp.apply(lambda df: grad_min(df.depth, df.flr_dif))
    # fill the quench_layer subscripted to the photic layer
    quenching_layer[photic_layer] = np.concatenate([l for l in quench_mask])

    # ################################### #
    #  DO THE QUENCHING CORRECTION MAGIC  #
    # ################################### #
    # a copy of fluorescence to be filled with quenching corrected data
    flr_corrected = flr.copy()
    # nighttime backscatter to fluorescence ratio
    flr_bb_night = flr_night / bbp_night
    # quenching ratio for nighttime
    quench_ratio = flr_bb_night * bbp / flr
    # apply the quenching ratio to the fluorescence
    quench_corrected = flr * quench_ratio
    # if unquenched data is corrected return the original data
    mask = quench_corrected < flr
    quench_corrected[mask] = flr[mask]
    # fill the array with quenching corrected data in the quenching layer only
    flr_corrected[quenching_layer] = quench_corrected[quenching_layer]

    flr_corrected = transfer_nc_attrs(getframe(),
                                      var,
                                      flr_corrected,
                                      'flr_quench_corrected',
                                      units='RFU')
    quenching_layer = transfer_nc_attrs(getframe(),
                                        var,
                                        quenching_layer,
                                        'quench_layer',
                                        units='')

    return flr_corrected, quenching_layer
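A standalone sketch of the day/night batching trick used above: transitions in a boolean day flag are cumulatively summed so that each night is grouped with the following day (night_day_group=True). The day array is made up and assumes the record starts during the day.

import numpy as np

day = np.array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0], dtype=bool)

transitions = np.abs(np.diff(day.astype(int)))  # 1 where day/night switches
batches = transitions.cumsum()
batch = np.r_[0, (batches + True) // 2]   # batch = a night + the following day
isday = (np.r_[0, batches] / 2 % 1) == 0  # recovers the day/night flag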
Example #19
def par_fill_surface(par, dives, depth, max_curve_depth=100):
    """
    Algebraically calculates the top 5 metres of the par profile.

    The function removes the top 5 metres of par data, and then uses an
    exponential equation calculates the complete profile.

    Parameters
    ----------
    par: numpy.ndarray or pandas.Series
        The par data with units uE/m2/sec.
    dives: numpy.ndarray or pandas.Series
        The dive count (round is down dives, 0.5 is up dives).
    depth: numpy.ndarray or pandas.Series
        The depth array in metres.
    max_curve_depth: int
        The maximum depth of which to fit the exponential function.


    Returns
    -------
    par_filled: numpy.ndarray or pandas.Series
        The par data with the algebraically calculated top 5 metres.

    """
    from scipy.optimize import curve_fit
    import numpy as np

    def dive_par_fit(depth, par):
        def exp_func(x, a, b):
            return a * np.exp(b * x)

        xj, yj = depth, par
        mask = ~(np.isnan(xj) | np.isnan(yj)) & (xj < max_curve_depth)
        xm, ym = xj[mask], yj[mask]

        if all(ym == 0) | (mask.sum() <= 2):
            yj_hat = np.ones_like(depth) * np.nan
        else:
            try:
                [a, b], _ = curve_fit(exp_func,
                                      xm,
                                      ym,
                                      p0=(500, -0.03),
                                      maxfev=1000)
                yj_hat = exp_func(xj, a, b)
            except RuntimeError:
                yj_hat = np.ones_like(depth) * np.nan

        return yj_hat

    var = par.copy()
    par = np.array(par)
    dives = np.array(dives)
    depth = np.array(depth)

    par_filled = np.ones_like(depth) * np.nan
    for d in np.unique(dives):
        i = dives == d
        par_fit = dive_par_fit(depth[i], par[i])
        par_filled[i] = par_fit

    par_filled = transfer_nc_attrs(getframe(), var, par_filled, 'par_expfill')

    return par_filled
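A standalone sketch of the per-dive exponential fit on a synthetic profile, assuming scipy is available (values are made up):

import numpy as np
from scipy.optimize import curve_fit

def exp_func(x, a, b):
    return a * np.exp(b * x)

depth = np.linspace(0, 100, 60)
par = 450 * np.exp(-0.04 * depth) + np.random.normal(0, 5, depth.size)
par[depth < 5] = np.nan  # noisy surface values that will be replaced

mask = ~np.isnan(par)
(a, b), _ = curve_fit(exp_func, depth[mask], par[mask], p0=(500, -0.03))
par_filled = exp_func(depth, a, b)  # full profile, surface included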
Example #20
def bottle_matchup(
    gld_dives,
    gld_depth,
    gld_time,
    btl_depth,
    btl_time,
    btl_values,
    min_depth_diff_metres=5,
    min_time_diff_minutes=120,
):
    """
    Performs a matchup between glider and bottle samples based on time and
    depth (or density).

    Parameters
    ----------
    gld_depth : np.array, dtype=float
        glider depth at time of measurement
    gld_dives : np.array, dtype=float
        dive index of the glider (given by glider toolbox)
    gld_time : np.array, dtype=datetime64
        glider time that will be used as primary indexing variable
    btl_time: np.array, dtype=datetime64
        in-situ bottle sample's time
    btl_depth : np.array, dtype=float
        depth of in-situ sample
    btl_values : np.array, dtype=float
        the value that will be interpolated onto the glider time and
        depth coordinates (time, depth/dens)
    min_depth_diff_metres : float, default=5
        the maximum depth difference allowed for a bottle-glider match
    min_time_diff_minutes : float, default=120
        the maximum time difference allowed between bottle and glider samples

    Returns
    -------
    array : float
        Returns the bottle values in the format of the glider
        i.e. the length of the output will be the same as gld_*

    """
    from pandas import Series

    # metadata preservation
    var = gld_depth.copy()
    if isinstance(btl_values, Series):
        var_name = btl_values.name + "_bottle_matchups"
    else:
        var_name = "bottle_matchups"

    # make all input variables np.arrays
    args = gld_time, gld_depth, gld_dives, btl_time, btl_depth, btl_values
    gld_time, gld_depth, gld_dives, btl_time, btl_depth, btl_values = map(
        _np.array, args
    )

    # create a blank array that matches glider data
    # (placeholder for calibration bottle values)
    gld_cal = _np.ones_like(gld_depth) * _np.nan

    # loop through each ship based CTD station
    stations = _np.unique(btl_time)
    for c, t in enumerate(stations):
        # index of station from ship CTD
        btl_idx = t == btl_time
        # number of samples per station
        btl_num = btl_idx.sum()

        # string representation of station time
        t_str = str(t.astype("datetime64[m]")).replace("T", " ")
        t_dif = abs(gld_time - t).astype("timedelta64[m]").astype(float)

        # loop through depths for the station
        if t_dif.min() < min_time_diff_minutes:
            # index of dive where minimum difference occurs
            i = _np.where(gld_dives[_np.nanargmin(t_dif)] == gld_dives)[0]
            n_depths = 0
            for depth in btl_depth[btl_idx]:
                # an index for bottle where depth and station match
                j = btl_idx & (depth == btl_depth)
                # depth difference for glider profile
                d_dif = abs(gld_depth - depth)[i]
                # only match depth if diff is less than given threshold
                if _np.nanmin(d_dif) < min_depth_diff_metres:
                    # index of min diff for this dive
                    k = i[_np.nanargmin(d_dif)]
                    # assign the bottle values to the calibration output
                    gld_cal[k] = btl_values[j]
                    n_depths += 1
            print(
                (
                    "[stn {}/{}] SUCCESS: {} ({} of {} samples) match-up "
                    "within {} minutes"
                ).format(c, stations.size, t_str, n_depths, btl_num, t_dif.min())
            )
        else:
            print(
                (
                    "[stn {}/{}]  FAILED: {} Couldn't find samples within "
                    "constraints"
                ).format(c, stations.size, t_str)
            )

    attrs = dict(units="", positive="", comment="", standard_name="", axis="")
    gld_cal = transfer_nc_attrs(getframe(), var, gld_cal, var_name, **attrs)

    return gld_cal
Example #21
def horizontal_diff_outliers(dives,
                             depth,
                             arr,
                             multiplier=1.5,
                             depth_threshold=450,
                             mask_frac=0.2):
    """
    Find z-score outliers on the horizontal. Can be limited to below a
    certain depth.

    The function computes the difference from a horizontal rolling median and
    flags z-score outliers below a defined depth threshold. Useful to identify
    when a variable at depth is not the same as neighbouring values.

    Parameters
    ----------

    dives: numpy.ndarray or pandas.Series
        The dive count (round is down dives, 0.5 is up dives)
    depth: numpy.ndarray or pandas.Series
        The depth array in metres
    arr: numpy.ndarray or pandas.Series
        Array of data variable for cleaning to be performed on.
    multiplier: float
        A z-score threshold
    depth_threshold: int
        Outliers will be identified below this depth value to the max depth
        value of the dive.
    mask_frac: float
        When the ratio of bad values per dive is greater than this value, then
        the dive will be masked.

    Returns
    -------
    mask
        A mask of dives where the bad values per dive ratio is greater than
        mask_frac.
    """
    from numpy import abs, arange, array, inf, nanmean, nanstd

    from .mapping import grid_data

    var = arr.copy()
    dives = array(dives)
    depth = array(depth)
    arr = array(arr)

    # grid data so that the horizontal rolling median can be calculated
    # we use a window of 3 to find only "horizontal spikes"
    gridded = grid_data(
        dives,
        depth,
        array(arr),
        bins=arange(0, depth.max(), 1),
        verbose=False,
        return_xarray=False,
    )
    median = gridded.rolling(3, axis=1, center=True, min_periods=2).median()
    # get zscore of the difference between the median and the raw data
    diff = gridded - median
    zdiff = abs(diff - nanmean(diff)) / nanstd(diff)

    # this finds the 99.7th percentile outliers
    # note that this is based on the global horizontal diff
    # but is only applied below the depth threshold
    # this means that the surface data sets a higher limit
    deep_outlier = zdiff.loc[depth_threshold:] >= multiplier

    # get the ratio of bad values per dive and mask if it
    # exceeds a user defined fraction
    deep_outlier_count = deep_outlier.sum()
    deep_obs_num = gridded.shape[0] - depth_threshold  # assumes bin of 1m
    deep_outlier_ratio = deep_outlier_count / deep_obs_num
    # finds the index where dives exceed the mask_frac threshold
    i = deep_outlier_ratio > mask_frac
    deep_outlier_dives = i[i].index.values

    mask = arr < -inf  # create a dummy mask
    for d in deep_outlier_dives:
        i = dives == d
        mask[i] = True

    baddives = mask_bad_dive_fraction(mask, dives, arr, mask_frac=mask_frac)[0]
    out = transfer_nc_attrs(getframe(), var, baddives, "_horzOutlierSTD")

    return out
Example #22
def calc_physics(
    variable,
    dives,
    depth,
    spike_window=3,
    spike_method="minmax",
    iqr=1.5,
    depth_threshold=400,
    mask_frac=0.2,
    savitzky_golay_window=11,
    savitzky_golay_order=2,
    verbose=True,
    name="Physics Variable",
):
    """
    A standard setup for processing physics variables (temperature, salinity).

    The function applies a neighbourhood interquartile range (IQR)
    outlier filter, the Briggs et al. (2011) spike filter
    followed by a Savitzky-Golay smoothing function.

    The Savitzky-Golay filter is demonstrated well on wikipedia:
    https://en.wikipedia.org/wiki/Savitzky-Golay_filter
    """

    from numpy import array, isnan

    from .cleaning import (
        despike,
        horizontal_diff_outliers,
        outlier_bounds_iqr,
        savitzky_golay,
    )

    # an interpolation step is added so that no nans are created.
    # Note that this interpolates on the flattened series
    var = variable.copy()  # attribute preservation

    x = array(dives)
    y = array(depth)
    z = array(variable)
    printv(verbose, "\n" + "=" * 50 + "\n{}:".format(name))

    if iqr:
        nans_before = isnan(z).sum()
        z = outlier_bounds_iqr(z, multiplier=iqr)
        nans_after = isnan(z).sum()
        n_masked = nans_after - nans_before
        printv(
            verbose,
            "\tRemoving outliers with IQR * {}: {} obs".format(iqr, n_masked),
        )

    if spike_window:
        z = despike(z, spike_window, spike_method)[0]
        printv(
            verbose,
            "\tRemoving spikes with rolling median (spike window={})".format(
                spike_window),
        )

    if depth_threshold:
        z = horizontal_diff_outliers(x, y, z, iqr, depth_threshold, mask_frac)
        printv(
            verbose,
            ("\tRemoving horizontal outliers "
             "(fraction={}, multiplier={})").format(mask_frac, iqr),
        )

    if savitzky_golay_window:
        printv(
            verbose,
            ("\tSmoothing with Savitzky-Golay filter "
             "(window={}, order={})").format(savitzky_golay_window,
                                             savitzky_golay_order),
        )
        z = savitzky_golay(z, savitzky_golay_window, savitzky_golay_order)

    z = transfer_nc_attrs(getframe(), var, z, "_processed")

    return z
Example #23
def savitzky_golay(var, window_size, order, deriv=0, rate=1, interpolate=True):
    """
    Smooth (and optionally differentiate) data with a Savitzky-Golay filter.

    The Savitzky-Golay filter removes high frequency noise from data [1]_.
    It has the advantage of preserving the original shape and features of the
    signal better than other types of filtering approaches, such as moving
    averages techniques. By default, nans in the array are interpolated with a
    limit set to the window size of the dataset before smoothing. The nans are
    inserted back into the dataset after the convolution. This limits the loss
    of data over blocks where there are nans. This can be switched off with the
    `interpolate` keyword argument.

    Parameters
    ----------
    var : array, dtype=float, shape=[n, ]
        the values of the time history of the signal.
    window_size : int
        the length of the window. Must be an odd integer number.
    order : int
        the order of the polynomial used in the filtering.
        Must be less than `window_size` - 1.
    deriv : int
        the order of the derivative to compute (default = 0 means only
        smoothing)
    interpolate : bool=True
        By default, nans in the array are interpolated with a limit set to
        the window size of the dataset before smoothing. The nans are
        inserted back into the dataset after the convolution. This limits
        the loss of data over blocks where there are nans. This can be
        switched off with the `interpolate` keyword argument.

    Returns
    -------
    ys : ndarray, shape (N)
        the smoothed signal (or its n-th derivative).

    Notes
    -----
    The Savitzky-Golay is a type of low-pass filter, particularly
    suited for smoothing noisy data. The main idea behind this
    approach is to make for each point a least-square fit with a
    polynomial of high order over an odd-sized window centered at
    the point [2]_.

    Examples
    --------
    >>> from numpy import linspace, exp, random
    >>> t = linspace(-4, 4, 500)
    >>> y = exp(-t**2) + random.normal(0, 0.05, t.shape)
    >>> ysg = savitzky_golay(y, window_size=31, order=4)
    >>> import matplotlib.pyplot as plt
    >>> plt.plot(t, y, label='Noisy signal')
    >>> plt.plot(t, exp(-t**2), 'k', lw=1.5, label='Original signal')
    >>> plt.plot(t, ysg, 'r', label='Filtered signal')
    >>> plt.legend()
    >>> plt.show()

    References
    ----------
    .. [1] A. Savitzky, M. J. E. Golay, Smoothing and Differentiation of
       Data by Simplified Least Squares Procedures. Analytical
       Chemistry, 1964, 36 (8), pp 1627-1639.
    .. [2] Numerical Recipes 3rd Edition: The Art of Scientific Computing
       W.H. Press, S.A. Teukolsky, W.T. Vetterling, B.P. Flannery
       Cambridge University Press ISBN-13: 9780521880688
    """
    from math import factorial

    from numpy import abs, array, concatenate, convolve, isnan, linalg, mat, nan
    from pandas import Series

    # sorting out window stuff
    arr = array(var)
    try:
        window_size = abs(int(window_size))
        order = abs(int(order))
    except ValueError:
        raise ValueError("window_size and order have to be of type int")
    if window_size % 2 != 1 or window_size < 1:
        raise TypeError("window_size size must be a positive odd number")
    if window_size < order + 2:
        raise TypeError("window_size is too small for the polynomial order")
    order_range = range(order + 1)
    half_window = (window_size - 1) // 2

    # allow to interpolate for the window size
    if interpolate:
        ser = Series(arr).interpolate()
        y = array(ser)
    else:
        y = array(arr)

    # precompute coefficients
    b = mat([[k**i for i in order_range]
             for k in range(-half_window, half_window + 1)])
    m = linalg.pinv(b).A[deriv] * rate**deriv * factorial(deriv)
    # pad the signal at the extremes with
    # values taken from the signal itself
    firstvals = y[0] - abs(y[1:half_window + 1][::-1] - y[0])
    lastvals = y[-1] + abs(y[-half_window - 1:-1][::-1] - y[-1])
    y = concatenate((firstvals, y, lastvals))

    savgol = convolve(m[::-1], y, mode="valid")

    oldnans = isnan(arr)
    savgol[oldnans] = nan

    savgol = transfer_nc_attrs(getframe(), var, savgol, "_savgolay")

    return savgol
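For comparison, SciPy ships an equivalent filter (scipy.signal.savgol_filter); this sketch only shows the basic call and does not reproduce the NaN interpolation handling of the function above.

import numpy as np
from scipy.signal import savgol_filter

t = np.linspace(-4, 4, 500)
y = np.exp(-t ** 2) + np.random.normal(0, 0.05, t.shape)
ysg = savgol_filter(y, window_length=11, polyorder=2)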
Example #24
def calc_backscatter(
    bb_raw,
    tempC,
    salt,
    dives,
    depth,
    wavelength,
    dark_count,
    scale_factor,
    spike_window=7,
    spike_method="median",
    iqr=3,
    profiles_ref_depth=300,
    deep_multiplier=1,
    deep_method="median",
    return_figure=False,
    verbose=True,
):
    r"""
    The function processes the raw backscattering data in counts into total
    backscatter (bbp) in m-1.

    The function uses a series of steps to clean the data before applying the
    Zhang et al. (2009) functions to convert the data into total backscatter
    (bbp, m-1). The function uses functions from the flo_functions toolkit [1]_.
    The theta angle of sensors (124deg) and xfactor for theta 124 (1.076) are
    set values that should be updated if you are not using a WetLabs ECO BB2FL

    The following standard sequence is applied:

    1. find IQR outliers  (i.e. data values outside of the lower and upper
       limits calculated by cleaning.outlier_bounds_iqr)
    2. find_bad_profiles  (e.g. high values below 300 m are counted as bad
       profiles)
    3. flo_scale_and_offset (factory scale and offset)
    4. flo_bback_total  (total backscatter based on Zhang et al. 2009) [2]_
    5. backscatter_dark_count  (based on Briggs et al. 2011) [3]_
    6. despike  (using Briggs et al. 2011 - rolling min--max) [3]_

    Parameters
    ----------

    bb_raw: np.array / pd.Series, dtype=float, shape=[n, ]
        The raw output from the backscatter channel in counts.
    tempC: np.array / pd.Series, dtype=float, shape=[n, ]
        The QC'd temperature data in degC.
    salt: np.array / pd.Series, dtype=float, shape=[n, ]
        The QC'd salinity in PSU.
    dives: np.array / pd.Series, dtype=float, shape=[n, ]
        The dive count (round is down dives, 0.5 is up dives).
    depth: np.array / pd.Series, dtype=float, shape=[n, ]
        The depth array in metres.
    wavelength: int
        The wavelength of the backscatter channel, e.g. 700 nm.
    dark_count: float
        The dark count factory values from the calibration sheet.
    scale_factor: float
        The scale factor factory values from the calibration sheet.
    spike_window: int
        The window size over which to run the despiking method.
    spike_method: str
        Whether to use a rolling median or combination of min+max filter as
        the despiking method.
    iqr: int
        Multiplier to determine the lower and upper limits of the
        interquartile range for outlier detection.
    profiles_ref_depth: int
        The depth threshold for optics.find_bad_profiles below which the
        median or mean is calculated for identifying outliers.
    deep_multiplier: int=1
        The standard deviation multiplier for calculating outliers,
        i.e. :math:`\mu \pm \sigma \cdot[1]`.
    deep_method: str
        Whether to use the deep median or deep mean to determine bad profiles
        for optics.find_bad_profiles.
    return_figure: bool
        If True, will return a figure object that shows before and after the
        quenching correction was applied.
    verbose: bool
        If True, will print the progress of the processing function.

    Returns
    -------
    baseline: numpy.ma.masked_array
        The despiked + bad-profile-identified backscatter with the mask
        denoting the filtered values of the backscatter baseline as
        defined in Briggs et al. (2011).
    spikes: np.array / pd.Series, dtype=float, shape=[n, ]
        The backscatter spikes as defined in Briggs et al. (2011).
    figs: object
        The figures reporting the despiking, bad profiles and quenching
        correction.

    References
    ----------
    .. [1] https://github.com/ooici/ion-functions Copyright (c) 2010, 2011 The
           Regents of the University of California
    .. [2] Zhang, X., Hu, L., & He, M. (2009). Scattering by pure seawater:
           Effect of salinity. Optics Express, 17(7), 5698.
           https://doi.org/10.1364/OE.17.005698
    .. [3] Briggs, N., Perry, M. J., Cetinic, I., Lee, C., D'Asaro, E., Gray,
           A. M., & Rehm, E. (2011). High-resolution observations of aggregate
           flux during a sub-polar North Atlantic spring bloom. Deep-Sea
           Research Part I: Oceanographic Research Papers, 58(10), 1031–1039.
           https://doi.org/10.1016/j.dsr.2011.07.007


    """
    from numpy import array, count_nonzero, isnan, nan, unique
    from pandas import Series

    from . import flo_functions as ff
    from . import optics as op
    from .cleaning import despike, despiking_report, outlier_bounds_iqr

    var = bb_raw.copy()  # metadata preservation
    bb_raw = Series(bb_raw.copy())
    dives = array(dives)
    depth = array(depth)
    tempC = array(tempC)
    salt = array(salt)

    name = "bb{:.0f}".format(wavelength)
    theta = 124  # factory set angle of optical sensors
    xfactor = 1.076  # for theta 124
    # Values taken from Sullivan et al. (2013) & Slade and Boss (2015)

    ref_depth = profiles_ref_depth
    stdev_multiplier = deep_multiplier
    method = deep_method

    dive_count = count_nonzero(unique(dives))

    printv(verbose, "\n" + "=" * 50 + "\n{}:".format(name))

    if iqr:

        nans_before = isnan(bb_raw).sum()
        bb_raw = outlier_bounds_iqr(bb_raw, multiplier=iqr)
        nans_after = isnan(bb_raw).sum()
        n_masked = nans_after - nans_before
        printv(
            verbose,
            "\tRemoving outliers with IQR * {}: {} obs".format(iqr, n_masked),
        )

    printv(
        verbose,
        "\tMask bad profiles based on deep values (depth={}m)".format(
            ref_depth),
    )
    bad_profiles = op.find_bad_profiles(dives, depth, bb_raw, ref_depth,
                                        stdev_multiplier, method)
    bb_raw[bad_profiles[0]] = nan

    bad_count = count_nonzero(bad_profiles[1])

    printv(
        verbose,
        "\tNumber of bad profiles = {}/{}".format(bad_count, dive_count),
    )
    printv(verbose, "\tZhang et al. (2009) correction")
    beta = ff.flo_scale_and_offset(bb_raw, dark_count, scale_factor)
    bbp = ff.flo_bback_total(beta, tempC, salt, theta, wavelength, xfactor)

    # This is from Briggs et al. (2011)
    printv(verbose, "\tDark count correction")
    bbp = op.backscatter_dark_count(bbp, depth)

    printv(
        verbose,
        "\tSpike identification (spike window={})".format(spike_window),
    )
    baseline, spikes = despike(bbp, spike_window, spike_method=spike_method)
    baseline = Series(baseline, name="bb{:.0f}".format(wavelength))

    baseline = transfer_nc_attrs(
        getframe(),
        var,
        baseline,
        name + "_baseline",
        units="units",
        standard_name="backscatter",
    )
    spikes = transfer_nc_attrs(
        getframe(),
        var,
        spikes,
        name + "_spikes",
        units="units",
        standard_name="backscatter",
    )

    if not return_figure:
        return baseline, spikes
    else:
        printv(verbose, "\tGenerating figure for despiking report")
        fig = despiking_report(dives, depth, bbp, baseline, spikes, name=name)

        return baseline, spikes, fig
Example #25
def calc_fluorescence(
    flr_raw,
    bbp,
    dives,
    depth,
    time,
    lat,
    lon,
    dark_count,
    scale_factor,
    spike_window=7,
    spike_method="median",
    night_day_group=True,
    sunrise_sunset_offset=1,
    profiles_ref_depth=300,
    deep_multiplier=1,
    deep_method="median",
    return_figure=False,
    verbose=True,
):
    r"""
    This function processes raw fluorescence and corrects for quenching using
    the Thomalla et al. (2018) approach [1]_.

    The following standard sequence is applied:

    1. find_bad_profiles  (e.g. high Fluorescence in > 300 m water signals
       bad profile)
    2. fluorescence_dark_count & scale factor  (i.e. factory correction)
    3. despike  (using Briggs et al. 2011 - rolling min--max)
    4. quenching_correction  (corrects for quenching with Thomalla et al. 2017)

    Parameters
    ----------
    flr_raw: np.array / pd.Series, dtype=float, shape=[n, ]
        The raw output of fluorescence data in instrument counts.
    bbp: np.array / pd.Series, dtype=float, shape=[n, ]
        The processed backscatter data from the less noisy channel, i.e. the
        one dataset with less spikes or bad profiles.
    dives: np.array / pd.Series, dtype=float, shape=[n, ]
        The dive count (round is down dives, 0.5 is up dives).
    depth: np.array / pd.Series, dtype=float, shape=[n, ]
        The depth array in metres.
    time: np.array / pd.Series, dtype=float, shape=[n, ]
        The date & time array in a numpy.datetime64 format.
    lat: np.array / pd.Series, dtype=float, shape=[n, ]
        The latitude of the glider position.
    lon: np.array / pd.Series, dtype=float, shape=[n, ]
        The longitude of the glider position.
    dark_count: float
        The dark count factory values from the calibration sheet.
    scale_factor: float
        The scale factor factory values from the calibration sheet.
    spike_window: int=7
        The window size over which to run the despiking method.
    spike_method: str=median
        Whether to use a rolling median or combination of min+max filter as
        the despiking method.
    night_day_group: bool=True
        If True, use preceding night otherwise use following night for
        calculating the flr:bbp ratio.
    sunrise_sunset_offset: int=1
        The delayed onset and recovery of quenching in hours [1]
        (assumes symmetrical).
    profiles_ref_depth: int=300
        The depth threshold for optics.find_bad_profiles below which the
        median or mean is calculated for identifying outliers.
    deep_multiplier: int=1
        The standard deviation multiplier for calculating outliers,
        i.e. mean ± (std × deep_multiplier).
    deep_method: str='median'
        Whether to use the deep median or deep mean to determine bad profiles
        for optics.find_bad_profiles.
    return_figure: bool=False
        If True, will return a figure object that shows before and after the
        quenching correction was applied.
    verbose: bool=True
        If True, will print the progress of the processing function.

    Returns
    -------
    baseline: array, dtype=float, shape=[n, ]
        The despiked + bad profile identified fluorescence that has not had
        the quenching correction applied.
    quench_corrected: array, dtype=float, shape=[n, ]
        The fluorescence data corrected for quenching.
    quench_layer: array, dtype=bool, shape=[n, ]
        The quenching layer as a mask.
    figs: object
        The figures reporting the despiking, bad profiles and quenching
        correction.

    References
    ----------
    .. [1] Thomalla, S. J., Moutier, W., Ryan-Keogh, T. J., Gregor, L.,
           & Schutt, J. (2018). An optimized method for correcting fluorescence
           quenching using optical backscattering on autonomous platforms.
           Limnology and Oceanography: Methods, 16(2), 132–144.
           https://doi.org/10.1002/lom3.10234

    """

    from numpy import array, count_nonzero, nan, unique

    from . import optics as op
    from .cleaning import despike, despiking_report

    var = flr_raw.copy()  # metadata preservation
    flr_raw = array(flr_raw)
    bbp = array(bbp)
    dives = array(dives)
    depth = array(depth)
    time = array(time)
    lat = array(lat)
    lon = array(lon)
    ref_depth = profiles_ref_depth
    stdev_multiplier = deep_multiplier
    method = deep_method

    printv(
        verbose,
        ("\n" + "=" * 50 + "\nFluorescence\n\tMask bad profiles based on "
         "deep values (ref depth={}m)").format(ref_depth),
    )
    bad_profiles = op.find_bad_profiles(dives, depth, flr_raw, ref_depth,
                                        stdev_multiplier, method)
    flr_raw[bad_profiles[0]] = nan

    bad_count = count_nonzero(bad_profiles[1])
    dive_count = count_nonzero(unique(dives))
    printv(
        verbose,
        "\tNumber of bad profiles = {}/{}".format(bad_count, dive_count),
    )

    printv(verbose, "\tDark count correction")
    flr_raw -= dark_count
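    # note: the scale_factor argument is accepted by this function but is not
    # applied in this listing; the output remains in relative units (RFU)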
    flr_dark = op.fluorescence_dark_count(flr_raw, depth)
    flr_dark[flr_dark < 0] = nan

    baseline, spikes = despike(flr_dark, spike_window, spike_method=spike_method)

    printv(verbose, "\tQuenching correction")
    quench_corrected, quench_layer = op.quenching_correction(
        baseline,
        bbp,
        dives,
        depth,
        time,
        lat,
        lon,
        sunrise_sunset_offset=sunrise_sunset_offset,
        night_day_group=night_day_group,
    )

    printv(
        verbose,
        "\tSpike identification (spike window={})".format(spike_window),
    )

    baseline = transfer_nc_attrs(
        getframe(),
        var,
        baseline,
        "FLR_baseline",
        units="RFU",
        standard_name="",
    )
    quench_corrected = transfer_nc_attrs(
        getframe(),
        var,
        quench_corrected,
        "FLR_quench_corrected",
        units="RFU",
        standard_name="fluorescence",
    )
    quench_layer = transfer_nc_attrs(
        getframe(),
        var,
        quench_layer,
        "quenching_layer",
        units="",
        standard_name="",
        comment="",
    )

    if return_figure:
        printv(verbose,
               "\tGenerating figures for despiking and quenching report")
        figs = (despiking_report(
            dives,
            depth,
            flr_raw,
            baseline.data,
            spikes,
            name="Fluorescence",
        ), )
        figs += (op.quenching_report(
            baseline.data,
            quench_corrected.data,
            quench_layer,
            dives,
            depth,
        ), )
        return baseline, quench_corrected, quench_layer, figs
    else:
        return baseline, quench_corrected, quench_layer
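# --- Hedged usage sketch (not part of the library source) --------------------
# How a call to ``calc_fluorescence`` might look for a pre-loaded glider
# dataset ``ds``. The dataset variable names, the calibration values and the
# assumption that the function is exposed at the package top level are all
# illustrative, not taken from the source.
import glidertools as gt

flr_baseline, flr_qc, quench_mask, figs = gt.calc_fluorescence(
    ds.fluorescence_raw,           # raw instrument counts
    ds.bbp470,                     # processed backscatter (less noisy channel)
    ds.dives, ds.depth, ds.time, ds.latitude, ds.longitude,
    dark_count=48,                 # factory calibration sheet value (example)
    scale_factor=0.0121,           # factory calibration sheet value (example)
    spike_window=11,
    return_figure=True,
)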
Exemple #26
0
def calc_par(
    par_raw,
    dives,
    depth,
    time,
    scale_factor_wet_uEm2s,
    sensor_output_mV,
    curve_max_depth=80,
    verbose=True,
):
    """
    Calculates the theoretical PAR based on an exponential curve fit.

    The processing steps are:

    1. par_scaling  (factory cal sheet scaling)
    2. par_dark_count  (correct deep par values to 0 using the 5th percentile)
    3. par_fill_surface  (return the theoretical curve of par based
       exponential fit)

    Parameters
    ----------
    All inputs must be ungridded np.ndarray or pd.Series data.

    par_raw : array, dtype=float, shape=[n, ]
        raw PAR
    dives : array, dtype=float, shape=[n, ]
        the dive count (whole numbers are down dives, x.5 are up dives)
    depth : array, dtype=float, shape=[n, ]
        in metres
    time : array, dtype=float, shape=[n, ]
        as a np.datetime64 array
    scale_factor_wet_uEm2s : float
        the scale factor from the factory calibration sheet in uE/m2/sec
    sensor_output_mV : float
        the sensor output in the dark from the factory calibration sheet (mV)
    curve_max_depth : int=80
        the maximum depth used when fitting the exponential curve
    verbose : bool=True
        if True, prints the progress of the processing function

    Returns
    -------
    par_filled : array, dtype=float, shape=[n, ]
        PAR with filled surface values.
    """

    from numpy import array

    from . import optics as op

    var = par_raw.copy()  # metadata preservation
    par_raw = array(par_raw)
    dives = array(dives)
    depth = array(depth)
    time = array(time)

    printv(verbose, "\n" + "=" * 50 + "\nPAR\n\tDark correction")

    # dark correction for par
    par_scaled = op.par_scaling(par_raw, scale_factor_wet_uEm2s,
                                sensor_output_mV)
    par_dark = op.par_dark_count(par_scaled, dives, depth, time)
    printv(verbose, "\tFitting exponential curve to data")
    par_filled = op.par_fill_surface(par_dark,
                                     dives,
                                     depth,
                                     max_curve_depth=curve_max_depth)
    par_filled[par_filled < 0] = 0

    attrs = dict(
        standard_name="photosynthetically_available_radiation",
        units="uE/m2/s2",
        comment="",
    )
    par_filled = transfer_nc_attrs(getframe(), var, par_filled,
                                   "PAR_processed", **attrs)
    par_filled = par_filled.fillna(0)

    return par_filled
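# --- Hedged usage sketch (not part of the library source) --------------------
# Illustrative call of ``calc_par`` with factory calibration values; ``ds``,
# the calibration numbers and the top-level ``gt.calc_par`` exposure are all
# assumptions for demonstration.
import glidertools as gt

par_filled = gt.calc_par(
    ds.par_raw, ds.dives, ds.depth, ds.time,
    scale_factor_wet_uEm2s=6.39e-4,  # from the PAR calibration sheet (example)
    sensor_output_mV=10.2,           # dark output from the sheet (example)
    curve_max_depth=80,              # fit the exponential over the top 80 m
)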
Exemple #27
0
def interp_obj(  # noqa: C901
    x,
    y,
    z,
    xi,
    yi,
    partial_sill=0.1,
    nugget=0.01,
    lenscale_x=20,
    lenscale_y=20,
    detrend=True,
    max_points_per_quad=55,
    min_points_per_quad=8,
    return_error=False,
    n_cpus=None,
    verbose=True,
    parallel_chunk_size=512,
):
    """
    Performs objective interpolation (or Kriging) of a 2D field.

    The objective interpolation breaks the problem into smaller fields by
    iteratively splitting the data into quadrants. Each quadrant is then
    interpolated (also using information from its neighbours).
    The interpolation is inverse-distance weighted using a Gaussian kernel (or
    radial basis function). The kernel has a width of 12 hours if the
    x-dimension is time, otherwise it is scaled by the x-variable unit. The
    kernel is in metres assuming that depth is the y-coordinate. This can be
    changed with keyword arguments. An error estimate can also be calculated
    if requested.

    The following link provides good background on the Kriging procedure:
    http://desktop.arcgis.com/en/arcmap/10.3/tools/3d-analyst-toolbox/how-kriging-works.htm


    Parameters
    ----------
    x : np.array | pd.series
        horizontal coordinates of the input data (same length as y, z)
        can be types float or datetime64
    y : np.array | pd.series
        vertical coordinates of the input data (same length as x, z)
    z : np.array | pd.series
        values to be interpolated (same length as x, y)
    xi : np.array
        horizontal coordinates of the interpolation grid (must be 1D)
        can be types float or datetime64
    yi : np.array | pd.series
        vertical coordinates of the interpolation grid (must be 1D)
    nugget : float [0.01]
        the error estimate due to sampling inaccuracy also known as the nugget
        in Kriging literature. This should be taken from the semivariogram
    partial_sill : float [0.1]
        represents the spatial covariance of the variable being interpolated.
        Should be estimated from the semivariogram. See Kriging literature for
        more information
    lenscale_x : float [20]
        horizontal length scale of the horizontal coordinate variable.
        If dtype(x) is np.datetime64 (any format) the length scale is in
        hours; otherwise it is in the units of x.
    lenscale_y : float [20]
        vertical length scale of the vertical coordinate variable (metres if
        y is depth).
    max_points_per_quad : int [55]
        the data is divided into quadrants using a quadtree approach -
        iteratively dividing data into smaller quadrants using x and y
        coordinates. The algorithm stops splitting the data into quadrants
        when no quadrant exceeds the limit set by max_points_per_quad. This
        is done to reduce the computational cost of the function.
    min_points_per_quad : int [8]
        sets the minimum number of points allowed in a neighbouring quadrant
        when creating the interpolation function for a particular quadrant. If
        the number of points is less than specified, the algorithm looks for
        neighbours of the neighbours to include more points in the
        interpolation.
    n_cpus : int [n - 1]
        use parallel computing. The quadrant calculations are spread across
        the given number of CPUs, which must be a positive integer. Defaults
        to the number of available CPUs minus one.
    parallel_chunk_size : int [512]
        the number of leaves that will be processed in parallel in one go. This
        is a memory-saving feature. If your dataset is very large, parallel
        processing will use up a lot of memory. Increasing the chunk size
        increases the memory requirements.
    verbose : bool [True]
        will print out information about the interpolation

    Returns
    -------
    xr.Dataset
        Contains the following arrays:
        - z: interpolated values
        - variance: error estimate of the interpolation
        - weights: the quadtree weighting used to calculate the estimates
        - nugget: the nugget used in the interpolation
        - partial_sill: value used for the interpolation

    Note
    ----
    The data may have semi-discrete artifacts. This is also present in the
    MATLAB output.

    Example
    -------
    >>> xi = np.arange(time.values.min(), time.values.max(), 30,
                       dtype='datetime64[m]')
    >>> yi = np.arange(depth.min(), depth.max(), 1.)
    >>> interpolated = gt.mapping.interp_obj(
            time, depth, var, xi, yi,
            nugget=.0035, partial_sill=0.02,
            lenscale_x=80, lenscale_y=80,
            detrend=True)

    """
    def get_detrend_model(x, y, z):
        model = linear_model.LinearRegression()
        model.fit(np.c_[x, y], z)

        return model

    import multiprocessing as mp
    from functools import partial
    from time import perf_counter as timer

    import xarray as xr
    from sklearn import linear_model

    if (n_cpus is None) | (n_cpus == 0):
        n_cpus = mp.cpu_count() - 1

    if verbose:
        print("Starting Interpolation with quadtree optimal interpolation")
        print("----------------------------------------------------------")
        print("\nPreparing for interpolations:")

    zvar = z.copy()
    yvar = y.copy()
    xvar = x.copy()

    is_time_x = np.issubdtype(x.dtype, np.datetime64)
    is_time_xi = np.issubdtype(xi.dtype, np.datetime64)
    ymessage = "y-coordinates are not the same type (x={}, xi={})".format(
        y.dtype, yi.dtype)
    xmessage = "x-coordinates are not the same type (x={}, xi={})".format(
        x.dtype, xi.dtype)
    assert y.dtype == yi.dtype, ymessage
    assert (is_time_x + is_time_xi) != 1, xmessage

    if is_time_x:  # convert data to hours
        if verbose:
            print("\tTime conversion")
        x = np.array(x).astype("datetime64[s]").astype(float) / 3600
        xi = np.array(xi).astype("datetime64[s]").astype(float) / 3600
        units_x = "hrs"
    else:
        units_x = ""

    if verbose:
        print("\tFinding and removing nans")
    nans = np.isnan(z) | np.isnan(x) | np.isnan(y)
    x, y, z = [np.array(a)[~nans] for a in [x, y, z]]

    # detrend data using linear regression
    if detrend:
        if verbose:
            print("\tRemoving data trend with linear regression")
        model = get_detrend_model(x, y, z)
        z_hat = model.predict(np.c_[x, y])
        z -= z_hat
    else:
        if verbose:
            print("\tRemoving data mean")
        z_avg = np.nanmean(z)
        z -= z_avg

    if verbose:
        print("\tBuilding QuadTree")
    quad_tree = QuadTree(np.c_[x, y], max_points_per_quad=max_points_per_quad)
    xx, yy = np.array(np.meshgrid(xi, yi)).reshape(2, -1)
    leaves = quad_tree.leaves
    n = len(leaves)

    interp_info = "\n".join([
        "\nInterpolation information:",
        "\tbasis points:        {}".format(x.size),
        "\tinterp grid:         {}, {}".format(xi.size, yi.size),
        "\tmax_points_per_quad: {}".format(max_points_per_quad),
        "\tmin_points_per_quad: {}".format(min_points_per_quad),
        "\tnumber of quads:     {}".format(n),
        "\tdetrend_method:      {}".format(
            "linear_regression" if detrend else "mean"),
        "\tpartial_sill:        {}".format(partial_sill),
        "\tnugget:              {}".format(nugget),
        "\tlengthscales:        X = {} {}".format(lenscale_x, units_x),
        "\t                     Y = {} m".format(lenscale_y),
    ])

    if verbose:
        print(interp_info)

    pool = mp.Pool(n_cpus)
    props = dict(
        z=z,
        xi=xx,
        yi=yy,
        nugget=nugget,
        partial_sill=partial_sill,
        lenscale_x=lenscale_x,
        lenscale_y=lenscale_y,
        min_points_per_quad=min_points_per_quad,
        return_error=return_error,
        verbose=verbose,
    )

    func = partial(interp_leaf, **props)

    # predefining matrices for interpolation
    errors = np.ndarray(xx.size) * 0
    weights = np.ndarray(xx.size) * 0
    variable = np.ndarray(xx.size) * 0
    # creating a timer to inform the user
    t0 = timer()
    # getting the index used to split the data up into chunks
    chunk_idx = np.arange(0, n, parallel_chunk_size, dtype=int)
    n_chunks = chunk_idx.size
    if verbose:
        print("\nProcessing interpolation in {} parts over {} CPUs:".format(
            n_chunks, n_cpus))
    for c, i0 in enumerate(chunk_idx):
        i1 = i0 + parallel_chunk_size
        chunk_leaves = leaves[i0:i1]
        # do the parallel processing
        chunk_output = pool.map(func, chunk_leaves)
        # add the parallel chunk output to the output arrays
        for w, zi, er, ii in chunk_output:
            weights[ii] += w
            variable[ii] += zi
            errors[ii] += er
        # create info for the user
        t1 = timer()
        if verbose:
            print("\tchunk {}/{} completed in {:.0f}s".format(
                c + 1, n_chunks, t1 - t0))
        t0 = timer()

    # completing the interpolation
    if verbose:
        print("\nFinishing off interoplation")
    if detrend:
        if verbose:
            print("\tAdding back the trend")
        zi = (variable / weights) + model.predict(np.c_[xx, yy])
    else:
        if verbose:
            print("\tAdding back the average")
        zi = (variable / weights) + z_avg
    errors = errors / weights
    if verbose & is_time_x:
        print("\tTime conversion")
    xi = (xi * 3600).astype("datetime64[s]") if is_time_x else xi

    if verbose:
        print("\tCreating xarray dataset for output")
    xds = xr.Dataset(
        attrs={
            "description": (
                "interpolation output from the GliderTools.interp_obj"
                "function. Print out mapping_info for more details"),
            "mapping_info":
            interp_info,
        })

    props = dict(dims=["y", "x"], coords={"y": yi, "x": xi})
    xds["z"] = xr.DataArray(zi.reshape(yi.size, xi.size), **props)
    xds["weights"] = xr.DataArray(weights.reshape(yi.size, xi.size), **props)
    xds["variance"] = xr.DataArray(errors.reshape(yi.size, xi.size), **props)
    xds.attrs["nugget"] = nugget
    xds.attrs["partial_sill"] = partial_sill

    dummy = transfer_nc_attrs(getframe(), zvar, zvar, "_interp")
    if isinstance(zvar, xr.DataArray):
        xds["z"].attrs = dummy.attrs
        # xds = xds.rename({'z': dummy.name})

    if isinstance(yvar, xr.DataArray):
        xds["y"].attrs = yvar.attrs
        xds = xds.rename({"y": yvar.name})

    if isinstance(xvar, xr.DataArray):
        xds["x"].attrs = xvar.attrs
        xds = xds.rename({"x": xvar.name})

    return xds
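# --- Hedged illustration (not part of the library source) --------------------
# A minimal sketch of the Gaussian (squared-exponential) covariance that a
# kriging scheme like the one above typically builds from ``partial_sill``,
# ``nugget`` and the two length scales. It is a generic formulation for
# illustration only and is not claimed to be the exact kernel used internally.
import numpy as np

def gaussian_covariance(dx, dy, partial_sill=0.1, nugget=0.01,
                        lenscale_x=20.0, lenscale_y=20.0):
    """Covariance of two points separated by (dx, dy) in length-scale units."""
    r2 = (dx / lenscale_x) ** 2 + (dy / lenscale_y) ** 2
    cov = partial_sill * np.exp(-0.5 * r2)
    # the nugget (sampling noise) only contributes at zero separation
    return np.where((dx == 0) & (dy == 0), cov + nugget, cov)

# covariance decays with separation, so nearby points receive more weight
print(gaussian_covariance(np.array([0.0, 10.0, 60.0]),
                          np.array([0.0, 5.0, 30.0])))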
Exemple #28
0
def grid_data(
    x,
    y,
    var,
    bins=None,
    how='mean',
    interp_lim=6,
    verbose=True,
    return_xarray=True,
):
    """
    Grids the input variable to bins for depth/dens (y) and time/dive (x).
    The bins can be specified to be non-uniform to adapt to variable sampling
    intervals of the profile. It is useful to use the ``gt.plot.bin_size``
    function to identify the sampling intervals. The bins are averaged (mean)
    by default, but ``median``, ``std``, or ``count`` can also be used.

    Parameters
    ----------
    x : np.array, dtype=float, shape=[n, ]
        The horizontal values by which to bin need to be in a pseudo-discrete
        format already. Dive number or ``time_average_per_dive`` are the
        standard inputs for this variable. Has ``p`` unique values.
    y : np.array, dtype=float, shape=[n, ]
        The vertical values that will be binned; typically depth, but can also
        be density or any other variable.
    bins : np.array, dtype=float; shape=[q, ], default=[0 : 1 : max_depth ]
        Define the bin edges for y with this function. If not defined, defaults
        to one meter bins.
    how : str, default='mean'
        the string form of a function that can be applied to pandas.Groupby
        objects. These include ``mean, median, std, count``.
    interp_lim : int, default=6
        sets the maximum extent to which NaNs will be filled.

    Returns
    -------
    glider_section : xarray.DataArray, shape=[p, q]
        A 2D section in the format specified by the ``return_xarray`` input.

    Raises
    ------
    UserWarning
        Triggers when ``x`` does not have discrete values.
    """
    from pandas import cut, Series
    from xarray import DataArray
    from numpy import array, c_, unique, diff

    xvar, yvar = x.copy(), y.copy()
    z = Series(var)
    y = array(y)
    x = array(x)

    u = unique(x).size
    s = x.size
    if (u / s) > 0.2:
        raise UserWarning(
            'The x input array must be pseudo discrete (dives or dive_time). '
            '{:.0f}% of x is unique (max 20% unique)'.format(u / s * 100))

    chunk_depth = 50
    optimal_bins, avg_sample_freq = get_optimal_bins(y, chunk_depth)
    if bins is None:
        bins = optimal_bins

    # warning if bin average is smaller than average bin size
    if verbose:
        avg_bin_size = diff(bins).mean()
        print(('Mean bin size = {:.2f}\n'
               'Mean depth binned ({} m) vertical sampling frequency = {:.2f}'
               ).format(avg_bin_size, chunk_depth, avg_sample_freq))

    labels = c_[bins[:-1], bins[1:]].mean(axis=1)
    bins = cut(y, bins, labels=labels)

    grp = Series(z).groupby([x, bins])
    grp_agg = getattr(grp, how)()
    gridded = grp_agg.unstack(level=0)
    gridded = gridded.reindex(labels.astype(float))

    if interp_lim > 0:
        gridded = gridded.interpolate(limit=interp_lim).bfill(limit=interp_lim)

    if not return_xarray:
        return gridded

    if return_xarray:
        dummy = transfer_nc_attrs(getframe(), var, var, '_vert_binned')

        xda = gridded.stack().to_xarray()
        if isinstance(var, DataArray):
            xda.attrs = dummy.attrs
            xda.name = dummy.name

        if isinstance(yvar, DataArray):
            y = xda.dims[0]
            xda[y].attrs = yvar.attrs
            xda = xda.rename({y: yvar.name})

        if isinstance(xvar, DataArray):
            x = xda.dims[1]
            xda[x].attrs = xvar.attrs
            xda = xda.rename({x: xvar.name})

        return xda
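# --- Hedged usage sketch (not part of the library source) --------------------
# Gridding a per-sample variable into 1 m depth bins per dive. The synthetic
# arrays stand in for real glider output, and the top-level ``gt.grid_data``
# exposure is an assumption; only the call signature mirrors the listing above.
import numpy as np
import glidertools as gt

dives = np.repeat(np.arange(20, dtype=float), 100)      # 20 pseudo-discrete dives
depth = np.tile(np.linspace(0, 100, 100), 20)           # 0-100 m per profile
temp = 20 - 0.1 * depth + 0.01 * np.random.randn(2000)  # synthetic variable

gridded = gt.grid_data(dives, depth, temp,
                       bins=np.arange(0, 101, 1.0),     # 1 m bin edges
                       how="mean", return_xarray=True)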
Exemple #29
0
def grid_data(
    x,
    y,
    var,
    bins=None,
    how="mean",
    interp_lim=6,
    verbose=True,
    return_xarray=True,
):
    """
    Grids the input variable to bins for depth/dens (y) and time/dive (x).
    The bins can be specified to be non-uniform to adapt to variable sampling
    intervals of the profile. It is useful to use the ``gt.plot.bin_size``
    function to identify the sampling intervals. The bins are averaged (mean)
    by default, but ``median``, ``std``, or ``count`` can also be used.

    Parameters
    ----------
    x : np.array, dtype=float, shape=[n, ]
        The horizontal values by which to bin need to be in a pseudo-discrete
        format already. Dive number or ``time_average_per_dive`` are the
        standard inputs for this variable. Has ``p`` unique values.
    y : np.array, dtype=float, shape=[n, ]
        The vertical values that will be binned; typically depth, but can also
        be density or any other variable.
    bins : np.array, dtype=float; shape=[q, ], default=[0 : 1 : max_depth ]
        Define the bin edges for y with this function. If not defined, defaults
        to one meter bins.
    how : str, default='mean'
        the string form of a function that can be applied to pandas.Groupby
        objects. These include ``mean, median, std, count``.
    interp_lim : int, default=6
        sets the maximum extent to which NaNs will be filled.

    Returns
    -------
    glider_section : xarray.DataArray, shape=[p, q]
        A 2D section in the format specified by the ``return_xarray`` input.

    Raises
    ------
    UserWarning
        Triggers when ``x`` does not have discrete values.
    """
    from numpy import array, c_, diff, unique
    from pandas import Series, cut
    from xarray import DataArray

    xvar, yvar = x.copy(), y.copy()
    z = Series(var)
    y = array(y)
    x = array(x)

    u = unique(x).size
    s = x.size
    if (u / s) > 0.2:
        raise UserWarning(
            "The x input array must be psuedo discrete (dives or dive_time). "
            "{:.0f}% of x is unique (max 20% unique)".format(u / s * 100))

    chunk_depth = 50
    # -DB this might not work if the user uses anything other than depth, e.g.
    # density. chunk_depth would in that case apply to density, which will
    # probably have a range that is much smaller than 50.
    optimal_bins, avg_sample_freq = get_optimal_bins(y, chunk_depth)
    if bins is None:
        bins = optimal_bins

    # warning if bin average is smaller than average bin size
    # -DB this is not being raised as a warning; instead it just seems like
    # useful information conveyed to the user. Further, none of this works out
    # if y is not depth, since avg_sample_freq will not make sense otherwise.
    if verbose:
        avg_bin_size = diff(bins).mean()
        print(("Mean bin size = {:.2f}\n"
               "Mean depth binned ({} m) vertical sampling frequency = {:.2f}"
               ).format(avg_bin_size, chunk_depth, avg_sample_freq))

    # -DB creates the mean bin values
    labels = c_[bins[:-1], bins[1:]].mean(axis=1)
    # -DB creates a new variable where, instead of the variable value, the bin
    # category is given (a kind of discretisation)
    bins = cut(y, bins, labels=labels)

    # -DB put z into the many bins (like a 2D histogram)
    grp = Series(z).groupby([x, bins])
    # -DB basically does grp.how(), in this case grp.mean()
    grp_agg = getattr(grp, how)()
    gridded = grp_agg.unstack(level=0)
    gridded = gridded.reindex(labels.astype(float))

    if interp_lim > 0:
        gridded = gridded.interpolate(limit=interp_lim).bfill(limit=interp_lim)

    if not return_xarray:
        return gridded

    if return_xarray:
        dummy = transfer_nc_attrs(getframe(), var, var, "_vert_binned")

        xda = DataArray(gridded)
        if isinstance(var, DataArray):
            xda.attrs = dummy.attrs
            xda.name = dummy.name

        if isinstance(yvar, DataArray):
            y = xda.dims[0]
            xda[y].attrs = yvar.attrs
            xda = xda.rename({y: yvar.name})

        if isinstance(xvar, DataArray):
            x = xda.dims[1]
            xda[x].attrs = xvar.attrs
            xda = xda.rename({x: xvar.name})

        return xda
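# --- Hedged usage sketch (not part of the library source) --------------------
# Non-uniform bin edges can mirror the glider's sampling: finer bins near the
# surface and coarser bins at depth. The edge values are illustrative; the
# docstring suggests ``gt.plot.bin_size`` for choosing them from the data.
import numpy as np

bin_edges = np.r_[np.arange(0, 100, 0.5),     # 0.5 m bins in the upper 100 m
                  np.arange(100, 1000, 2.0)]  # 2 m bins below 100 m
# gridded = gt.grid_data(dives, depth, temp, bins=bin_edges, how="median")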
Exemple #30
0
def calc_oxygen(
    o2raw,
    pressure,
    salinity,
    temperature,
    auto_conversion=True,
    spike_window=7,
    spike_method="median",
    savitzky_golay_window=0,
    savitzky_golay_order=2,
    verbose=True,
):
    """
    This function processes oxygen.

    It is assumed that either mL/L or umol/kg are passed as input.
    The units are automatically detected by looking at the mean ratio.
    Below are some conversions to help with the oxygen units:

    >>> µmol/l = µmol/kg * 1.025
        µmol/l = ml/l * 44.66
        µmol/l = mg/l * 31.25

    Parameters
    ----------
    o2raw : array, dtype=float, shape=[n, ]
        raw oxygen without unit conversion
    pressure : array, dtype=float, shape=[n, ]
    salinity : array, dtype=float, shape=[n, ]
    temperature : array, dtype=float, shape=[n, ]
    auto_conversion : bool=True
        tries to determine the unit of oxygen based on ``o2raw`` values.
        The user needs to do a manual conversion if False
    spike_window : int=7
        rolling window size to apply for the ``cleaning.despike`` function.
    spike_method : string='median'
        can be 'median' or 'minmax'. see ``cleaning.despike`` for more info.
    savitzky_golay_window : int=0
        rolling window size for ``cleaning.savitzky_golay`` function
    savitzky_golay_order : int=2
        polynomial order for ``cleaning.savitzky_golay`` function
    verbose : bool=True

    Returns
    -------
    o2mll : array, dtype=float, shape=[n, ]
        oxygen concentration in mL/L (if unit auto_conversion is set True)
    o2pct : array, dtype=float, shape=[n, ]
        theoretical oxygen saturation percentage
    o2aou : array, dtype=float, shape=[n, ]
        apparent oxygen utilisation based on measured oxygen and oxygen
        saturation.

    Note
    ----
    To Do: Oxygen processing should have its own section to be consistent

    """

    import seawater as sw
    from numpy import abs, array, c_, isnan, median, ones
    from pandas import Series

    from .cleaning import despike, outlier_bounds_iqr, savitzky_golay

    var = o2raw.copy()  # metadata preservation
    if isinstance(o2raw, Series):
        name = o2raw.name
    else:
        name = "Oxygen"
    o2raw = array(o2raw)
    pressure = array(pressure)
    temperature = array(temperature)
    salinity = array(salinity)

    if spike_window:
        o2raw, _ = despike(o2raw, spike_window, spike_method)
        printv(
            verbose,
            "\n" + "=" * 50 + "\n{}:\n"
            "\tSmoothing data with despiking algorithm:\n\t"
            "    spike identification (spike window={})"
            "".format(name, spike_window),
        )

    if savitzky_golay_window:
        printv(
            verbose,
            ("\tSmoothing with Savitzky-Golay filter "
             "(window={}, order={})").format(savitzky_golay_window,
                                             savitzky_golay_order),
        )
        o2raw = savitzky_golay(o2raw, savitzky_golay_window,
                               savitzky_golay_order)

    o2sat = sw.satO2(salinity, temperature)
    density = sw.dens(salinity, temperature, pressure)

    if auto_conversion:
        # use linear regression to determine the oxygen unit
        # raw surface (<10m) O2 is regressed theoretical saturation
        # the slope of the regression will be indicative of the
        # units as theoretical saturation is always in mL/L
        # Use the min difference between the slope and known
        # conversion factors to estimate the appropriate conversion.

        # clean the data first with basic cleaning
        surf = (pressure < 20) & ~isnan(o2raw) & ~isnan(o2sat)
        # prepare the data for linear regression
        Y = o2raw[surf].copy()
        X = c_[ones(surf.sum()), o2sat[surf]]
        # removing outliers according to IQR
        ll, ul = outlier_bounds_iqr(Y, multiplier=1.5)
        m = (Y > ll) & (Y < ul)
        ratios = Y[m] / X[m, 1]

        # compare the slopes
        observed_ratio = median(ratios)
        # the theoretical values have been divided by 1.025 to account for
        # the density of seawater
        theoretic_ratio = array([1, 43.5])
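        # note: only the mL/L and umol/kg ratios are listed above, so the
        # mg/L branch (unit_idx == 2) below can never be selected by argmin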
        ratio_diffs = abs(observed_ratio - theoretic_ratio)
        # catch if the difference is too big
        if ratio_diffs.min() > 10:
            printv(
                verbose,
                ("Oxygen unit could not be estimated automatically. "
                 "Do the unit conversion on the raw data before "
                 "passing it to the function. \n"
                 "Below is some info to help you\n"
                 "    µmol/l > µmol/kg * 1.025\n"
                 "    µmol/l > ml/l * 44.66\n"
                 "    µmol/l > mg/l * 31.25"),
            )
        # otherwise do the conversion
        else:
            unit_idx = ratio_diffs.argmin()
            if unit_idx == 0:
                unit = "mL/L"
                o2mll = array(o2raw)
            elif unit_idx == 2:
                unit = "mg/L"
                o2mll = array(o2raw) / 31.25 * (density / 1000)
            elif unit_idx == 1:
                unit = "umol/kg"
                o2mll = array(o2raw) / 44.66 * (density / 1000)
            else:
                printv(verbose, "Difference is {}".format(ratio_diffs))
            printv(verbose, "\tUnits automatically detected {}".format(unit))
            if ratio_diffs.min() > 5:
                print("\tWARNING: Confirm units mannually as near the "
                      "confidence threshold")
        o2aou = o2sat - o2mll
        o2pct = o2mll / o2sat * 100

        o2mll = transfer_nc_attrs(
            getframe(),
            var,
            o2mll,
            "o2mll",
            units="mL/L",
            comment="",
            standard_name="dissolved_oxygen",
        )
        o2aou = transfer_nc_attrs(
            getframe(),
            var,
            o2aou,
            "o2aou",
            units="mL/L",
            comment="",
            standard_name="apparent_oxygen_utilisation",
        )
        o2pct = transfer_nc_attrs(
            getframe(),
            var,
            o2pct,
            "o2pct",
            units="percent",
            comment="",
            standard_name="theoretical_oxygen_saturation",
        )

        return o2mll, o2pct, o2aou

    else:
        print("No oxygen conversion applied - user "
              "must impliment before or after running "
              "the cleaning functions.")