def within_n_std(df, n=3):
    """Asserts that every value is within ``n`` standard deviations of its column's mean.

    Only numeric columns are checked; non-numeric columns are ignored.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        n (int): Number of standard deviations from the mean.

    Returns:
        Original `df`.

    Raises:
        AssertionError: If any checked value is ``n`` or more standard
            deviations away from its column's mean. The message is the
            result of ``bad_locations`` for the offending cells.

    """
    # Skip non-numeric columns explicitly. Relying on pandas to drop them
    # silently no longer works: pandas >= 2.0 raises TypeError from
    # mean()/std() on mixed-dtype frames.
    means = df.mean(numeric_only=True)
    stds = df.std(numeric_only=True)

    # The comparison is strict, and any comparison involving NaN is False,
    # so NaN cells (and every cell of a zero-variance column) are reported
    # as outliers.
    inliers = np.abs(df[means.index] - means) < n * stds
    if not np.all(inliers):
        msg = bad_locations(~inliers)
        raise AssertionError(msg)
    return df
def has_no_infs(df, columns=None):
    """Asserts that there are no infinite values (``np.inf`` or ``-np.inf``) in `df`.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        columns (list): A subset of columns to check for infinities.
            Defaults to all columns of `df`.

    Returns:
        Original `df`.

    Raises:
        AssertionError: If any checked cell is positive or negative infinity.
            The message is the result of ``bad_locations`` for those cells.

    """
    if columns is None:
        columns = df.columns

    # Check both signs: ``isin([np.inf])`` alone lets -inf through.
    infs = df[columns].isin([np.inf, -np.inf])

    # Raise explicitly instead of using ``assert`` so the check still runs
    # under ``python -O``, where assert statements are stripped.
    if infs.values.any():
        raise AssertionError(bad_locations(infs))
    return df
def has_no_x(df, values=None, columns=None):
    """Asserts that there are no user-specified `values` in `df`'s `columns`.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        values (list): A list of values to check for in the pd.DataFrame.
            Defaults to an empty list, in which case nothing can match.
        columns (list): A subset of columns to check for `values`.
            Defaults to all columns of `df`.

    Returns:
        Original `df`.

    Raises:
        AssertionError: If any checked cell holds one of `values`.
            The message is the result of ``bad_locations`` for those cells.

    """
    values = [] if values is None else values
    columns = df.columns if columns is None else columns

    found = df[columns].isin(values)

    # Raise explicitly instead of using ``assert`` so the check still runs
    # under ``python -O``, where assert statements are stripped.
    if found.values.any():
        raise AssertionError(bad_locations(found))
    return df
def is_monotonic(df, items=None, increasing=None, strict=False):
    """Asserts that the `df` is monotonic.

    Consecutive differences that are NaN (the first row, and rows next to a
    missing value) are not treated as violations.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        items (dict): Mapping of columns to conditions (increasing, strict)
            E.g. {'col_a': (None, False), 'col_b': (None, False)}
            Defaults to applying `increasing` and `strict` to every column.
        increasing (bool, None): None checks for either increasing or
            decreasing monotonicity.
        strict (bool): Whether the comparison should be strict, meaning two
            values in a row being equal should fail.

    Returns:
        Original `df`.

    Raises:
        AssertionError: If any checked column violates its monotonicity
            condition. The message is the result of ``bad_locations`` for
            the offending cells.

    Examples:
        The following check will pass, since each column matches its
        monotonicity requirements:

        >>> import bulwark.checks as ck
        >>> import pandas as pd
        >>> df = pd.DataFrame({"incr_strict": [1, 2, 3, 4],
        ...                    "incr_not_strict": [1, 2, 2, 3],
        ...                    "decr_strict": [4, 3, 2, 1],
        ...                    "decr_not_strict": [3, 2, 2, 1]})
        >>> items = {
        ...     "incr_strict": (True, True),
        ...     "incr_not_strict": (True, False),
        ...     "decr_strict": (False, True),
        ...     "decr_not_strict": (False, False)
        ... }
        >>> ck.is_monotonic(df, items=items)
           incr_strict  incr_not_strict  decr_strict  decr_not_strict
        0            1                1            4                3
        1            2                2            3                2
        2            3                2            2                2
        3            4                3            1                1

        All of the same cases will also pass if increasing=None,
        since only one of increasing or decreasing monotonicity is then required:

        >>> ck.is_monotonic(df, increasing=None, strict=False)
           incr_strict  incr_not_strict  decr_strict  decr_not_strict
        0            1                1            4                3
        1            2                2            3                2
        2            3                2            2                2
        3            4                3            1                1

        The following check will fail, displaying a list of which
        (row, column)s caused the issue:

        >>> df2 = pd.DataFrame({'not_monotonic': [1, 2, 3, 2]})
        >>> ck.is_monotonic(df2, increasing=True, strict=False)
        Traceback (most recent call last):
            ...
        AssertionError: [(3, 'not_monotonic')]

    """
    if items is None:
        items = {col: (increasing, strict) for col in df}

    operator_choices = {
        # key = (increasing, strict)
        (True, True): operator.gt,
        (False, True): operator.lt,
        (True, False): operator.ge,
        (False, False): operator.le,
        (None, True): (operator.gt, operator.lt),
        (None, False): (operator.ge, operator.le),
    }

    # Every mask is kept on df's own index. Building the frame from
    # differently-indexed (NaN-dropped) series would align later columns to
    # the first column's index and could silently discard violations.
    bad = pd.DataFrame(index=df.index)
    for col, (col_increasing, col_strict) in items.items():
        diffs = df[col].diff()
        # NaN differences cannot be judged, so they never count as violations.
        comparable = diffs.notna()
        op = operator_choices[(col_increasing, col_strict)]

        if col_increasing is None:
            incr_ok = op[0](diffs, 0) | ~comparable
            decr_ok = op[1](diffs, 0) | ~comparable
            # Either direction is acceptable, as long as the whole column
            # agrees on one of them.
            if incr_ok.all() or decr_ok.all():
                continue
            bad[col] = ~(incr_ok & decr_ok)
        else:
            bad[col] = ~op(diffs, 0) & comparable

    if bad.values.any():
        msg = bad_locations(bad)
        raise AssertionError(msg)
    return df