Exemple #1
0
def within_n_std(df, n=3):
    """Asserts that every value is within ``n`` standard deviations of its column's mean.

    The bound is inclusive, so a column whose values are all identical
    (standard deviation of zero) passes instead of having every cell reported.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        n (int): Number of standard deviations from the mean.

    Returns:
        Original `df`.

    Raises:
        AssertionError: If any value lies more than ``n`` standard deviations
            from its column's mean. The message is built by `bad_locations`
            from the mask of offending cells.

    """
    means = df.mean()
    stds = df.std()
    # Only compare the columns for which a mean was actually computed.
    deviations = np.abs(df[means.index] - means)
    # Inclusive bound: with a strict `<`, a zero-std column could never pass (0 < 0 is False).
    inliers = deviations <= n * stds
    # Reduce over the raw array so the result is always a single bool.
    if not inliers.values.all():
        raise AssertionError(bad_locations(~inliers))
    return df
Exemple #2
0
def has_no_infs(df, columns=None):
    """Asserts that there are no np.infs in `df`.

    Matching is done with ``isin([np.inf])``, so only positive infinity is
    looked for.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        columns (list): A subset of columns to check for np.infs.
            Defaults to every column of `df`.

    Returns:
        Original `df`.

    Raises:
        AssertionError: If any checked cell equals ``np.inf``. The message is
            built by `bad_locations` from the mask of offending cells.

    """
    if columns is None:
        columns = df.columns

    # Build the mask once; it serves both the check and the error report.
    is_inf = df[columns].isin([np.inf])
    # Raise explicitly instead of relying on `assert`, which is removed under
    # `python -O` and would silently disable this check.
    if is_inf.values.any():
        raise AssertionError(bad_locations(is_inf))
    return df
Exemple #3
0
def has_no_x(df, values=None, columns=None):
    """Asserts that there are no user-specified `values` in `df`'s `columns`.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        values (list): A list of values to check for in the pd.DataFrame.
            Defaults to an empty list, in which case nothing can match.
        columns (list): A subset of columns to check for `values`.
            Defaults to every column of `df`.

    Returns:
        Original `df`.

    Raises:
        AssertionError: If any checked cell is one of `values`. The message is
            built by `bad_locations` from the mask of offending cells.

    """
    values = values if values is not None else []
    columns = columns if columns is not None else df.columns

    # Build the mask once; it serves both the check and the error report.
    matches = df[columns].isin(values)
    # Raise explicitly instead of relying on `assert`, which is removed under
    # `python -O` and would silently disable this check.
    if matches.values.any():
        raise AssertionError(bad_locations(matches))
    return df
Exemple #4
0
def is_monotonic(df, items=None, increasing=None, strict=False):
    """Asserts that the checked columns of `df` are monotonic.

    Each column's consecutive differences are compared against zero using the
    comparison selected by its ``(increasing, strict)`` pair.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        items (dict): Per-column conditions, mapping column -> (increasing, strict).
                      E.g. {'col_a': (None, False), 'col_b': (None, False)}
                      When omitted, every column of `df` is checked with the
                      `increasing` and `strict` arguments.
        increasing (bool, None): True requires increasing values, False requires
                                 decreasing values, and None accepts either direction.
        strict (bool): If True, two equal values in a row count as a failure.

    Returns:
        Original `df`.

    Raises:
        AssertionError: If any checked column violates its condition.
            The message is built by `bad_locations`.

    Examples:
        The following check will pass, since each column matches its monotonicity requirements:

        >>> import bulwark.checks as ck
        >>> import pandas as pd
        >>> df = pd.DataFrame({"incr_strict": [1, 2, 3, 4],
        ...                    "incr_not_strict": [1, 2, 2, 3],
        ...                    "decr_strict": [4, 3, 2, 1],
        ...                    "decr_not_strict": [3, 2, 2, 1]})
        >>> items = {
        ...     "incr_strict": (True, True),
        ...     "incr_not_strict": (True, False),
        ...     "decr_strict": (False, True),
        ...     "decr_not_strict": (False, False)
        ... }
        >>> ck.is_monotonic(df, items=items)
           incr_strict  incr_not_strict  decr_strict  decr_not_strict
        0            1                1            4                3
        1            2                2            3                2
        2            3                2            2                2
        3            4                3            1                1

        All of the same cases will also pass if increasing=None,
        since only one of increasing or decreasing monotonicity is then required:

        >>> ck.is_monotonic(df, increasing=None, strict=False)
           incr_strict  incr_not_strict  decr_strict  decr_not_strict
        0            1                1            4                3
        1            2                2            3                2
        2            3                2            2                2
        3            4                3            1                1

        The following check will fail,
        displaying a list of which (row, column)s caused the issue:

        >>> df2 = pd.DataFrame({'not_monotonic': [1, 2, 3, 2]})
        >>> ck.is_monotonic(df2, increasing=True, strict=False)
        Traceback (most recent call last):
            ...
        AssertionError: [(3, 'not_monotonic')]

    """
    # (increasing, strict) -> comparison(s) applied to the differences vs. zero.
    # A direction of None carries both the increasing and the decreasing test.
    comparators = {
        (True, True): (operator.gt,),
        (True, False): (operator.ge,),
        (False, True): (operator.lt,),
        (False, False): (operator.le,),
        (None, True): (operator.gt, operator.lt),
        (None, False): (operator.ge, operator.le),
    }

    if items is None:
        items = dict.fromkeys(df, (increasing, strict))

    violations = pd.DataFrame()
    for column, (direction, is_strict) in items.items():
        # The first difference is always NaN, hence the dropna().
        steps = df[column].diff().dropna()
        tests = comparators[(direction, is_strict)]

        if direction is not None:
            (compare,) = tests
            violations[steps.name] = ~compare(steps, 0)
            continue

        rising_test, falling_test = tests
        rises = rising_test(steps, 0)
        falls = falling_test(steps, 0)
        # Either direction is acceptable; only record the column if both fail.
        if not (rises.all() or falls.all()):
            violations[steps.name] = ~(rises & falls)

    if np.any(violations):
        raise AssertionError(bad_locations(violations))

    return df