def test_incremental_weighted_mean_and_variance(mean, var, weight_loc,
                                                weight_scale, rng):
    """Check incremental weighted mean/variance against one-shot references.

    Two scenarios are exercised on a freshly drawn ``(100, 20)`` matrix:

    * random sample weights, with ``np.average`` as the reference;
    * unit sample weights, with ``np.mean`` / ``np.var`` as the reference.

    Parameters
    ----------
    mean, var
        Forwarded as ``loc`` and ``scale`` to ``rng.normal`` when drawing
        the data matrix.
    weight_loc, weight_scale
        Forwarded as ``loc`` and ``scale`` to ``rng.normal`` when drawing
        the sample weights.
    rng
        Object exposing ``normal(loc=..., scale=..., size=...)``
        (presumably a NumPy random generator supplied by the test
        harness -- confirm against the fixture definition).
    """
    n_samples, n_features = 100, 20
    shape = (n_samples, n_features)

    def _check_all_chunkings(X, sample_weight, expected_mean, expected_var):
        # Correctness and numerical stability: stream X through the
        # incremental routine with several batch sizes (from one row at a
        # time up to the full matrix) and compare the final statistics
        # with the reference values after each complete pass.
        n_rows = X.shape[0]
        chunk_sizes = [
            1,
            n_rows // 10 + 1,
            n_rows // 4 + 1,
            n_rows // 2 + 1,
            n_rows,
        ]
        for chunk_size in chunk_sizes:
            running_mean = running_var = running_weight_sum = 0
            for batch in gen_batches(n_rows, chunk_size):
                running_mean, running_var, running_weight_sum = (
                    _incremental_weighted_mean_and_var(
                        X[batch],
                        sample_weight[batch],
                        running_mean,
                        running_var,
                        running_weight_sum,
                    )
                )
            assert_allclose(running_mean, expected_mean)
            assert_allclose(running_var, expected_var, atol=1e-6)

    # The weights are drawn first so the order of draws from ``rng`` stays
    # weights -> weighted-case data -> unweighted-case data.
    random_weight = rng.normal(loc=weight_loc, scale=weight_scale,
                               size=n_samples)

    # Scenario 1: weighted statistics, reference is np.average.
    X = rng.normal(loc=mean, scale=var, size=shape)
    reference_mean = _safe_accumulator_op(
        np.average, X, weights=random_weight, axis=0)
    squared_deviation = (X - reference_mean) ** 2
    reference_var = _safe_accumulator_op(
        np.average, squared_deviation, weights=random_weight, axis=0)
    _check_all_chunkings(X, random_weight, reference_mean, reference_var)

    # Scenario 2: unit weights, reference is the plain np.mean / np.var.
    X = rng.normal(loc=mean, scale=var, size=shape)
    unit_weight = np.ones(n_samples)
    reference_mean = _safe_accumulator_op(np.mean, X, axis=0)
    reference_var = _safe_accumulator_op(np.var, X, axis=0)
    _check_all_chunkings(X, unit_weight, reference_mean, reference_var)
def _daal_assert_all_finite(X, allow_nan=False, msg_dtype=None):
    """Raise ``ValueError`` when ``X`` contains non-finite values.

    Counterpart of ``assert_all_finite`` for array-likes and pandas
    DataFrames.  The check performed depends on the data:

    * 1-D or 2-D, non-empty, ``float32``/``float64`` data is handed to
      ``d4p.daal_assert_all_finite``;
    * any other float or complex data is checked with NumPy;
    * object data is only checked for NaN, and only when ``allow_nan``
      is false;
    * every other dtype is accepted without inspection.

    Nothing is checked when the ``assume_finite`` configuration flag is
    set.

    Parameters
    ----------
    X : array-like or DataFrame
        Data to validate.  A DataFrame for which ``get_number_of_types``
        reports more than one type is passed to the d4p routine as a list
        of per-column C-contiguous arrays; anything else is converted
        with ``np.asanyarray``.
    allow_nan : bool, default=False
        When true, NaN is tolerated and only infinities are rejected.
    msg_dtype : dtype, default=None
        Dtype to name in the error message; the dtype of the data is
        used when this is None.

    Raises
    ------
    ValueError
        If a forbidden value is found.
    """
    # validation is also imported in extmath
    from sklearn.utils.extmath import _safe_accumulator_op

    if _get_config()['assume_finite']:
        return

    is_df = is_DataFrame(X)
    num_of_types = get_number_of_types(X)
    heterogeneous_df = is_df and num_of_types > 1

    if heterogeneous_df:
        # A heterogeneous pandas.DataFrame is converted to one array per
        # column, copying a column only when it is not already
        # C-contiguous.
        raw_columns = (X[name].to_numpy() for name in X)
        column_arrays = [
            col if col.flags['C_CONTIGUOUS'] else np.ascontiguousarray(col)
            for col in raw_columns
        ]
    else:
        X = np.asanyarray(X)

    dt = np.dtype(get_dtype(X))
    is_float = dt.kind in 'fc'

    rejected = 'infinity' if allow_nan else 'NaN, infinity'
    reported_dtype = dt if msg_dtype is None else msg_dtype
    err = "Input contains {} or a value too large for {!r}.".format(
        rejected, reported_dtype)

    use_daal = (
        X.ndim in [1, 2]
        and not np.any(np.equal(X.shape, 0))
        and dt in [np.float32, np.float64]
    )
    if use_daal:
        if X.ndim == 1:
            X = X.reshape((-1, 1))
        payload = column_arrays if heterogeneous_df else X
        # Third argument: 0 is passed for float64 data and 1 for float32
        # (presumably a precision selector -- confirm against d4p).
        precision_flag = 0 if dt == np.float64 else 1
        if not d4p.daal_assert_all_finite(payload, allow_nan,
                                          precision_flag):
            raise ValueError(err)
        return

    if is_float:
        # Fast path for the common all-finite case: a finite total proves
        # every element is finite in O(n) time and O(1) extra space.  The
        # sum goes through the safe accumulator to reduce dtype-induced
        # overflow.
        if np.isfinite(_safe_accumulator_op(np.sum, X)):
            return
        # A non-finite total may just be overflow of the sum, so fall
        # back to the O(n)-space element-wise check before rejecting.
        if allow_nan:
            has_forbidden = np.isinf(X).any()
        else:
            has_forbidden = not np.isfinite(X).all()
        if has_forbidden:
            raise ValueError(err)
        return

    # for object dtype data, we only check for NaNs (GH-13254)
    if dt == np.dtype('object') and not allow_nan:
        if _object_dtype_isnan(X).any():
            raise ValueError("Input contains NaN")
def _daal_assert_all_finite(X, allow_nan=False, msg_dtype=None):
    """Raise ``ValueError`` when ``X`` contains non-finite values.

    Counterpart of ``assert_all_finite`` that first converts ``X`` with
    ``np.asanyarray``.  The check performed depends on the data:

    * 1-D or 2-D, non-empty, ``float32``/``float64`` arrays are handed to
      ``d4p.daal_assert_all_finite``;
    * any other float or complex array is checked with NumPy;
    * object arrays are only checked for NaN, and only when ``allow_nan``
      is false;
    * every other dtype is accepted without inspection.

    Nothing is checked when the ``assume_finite`` configuration flag is
    set.

    Parameters
    ----------
    X : array-like
        Data to validate.
    allow_nan : bool, default=False
        When true, NaN is tolerated and only infinities are rejected.
    msg_dtype : dtype, default=None
        Dtype to name in the error message; ``X.dtype`` is used when this
        is None.

    Raises
    ------
    ValueError
        If a forbidden value is found.
    """
    # validation is also imported in extmath
    from sklearn.utils.extmath import _safe_accumulator_op

    if _get_config()['assume_finite']:
        return

    X = np.asanyarray(X)
    dt = X.dtype
    is_float = dt.kind in 'fc'

    rejected = 'infinity' if allow_nan else 'NaN, infinity'
    reported_dtype = dt if msg_dtype is None else msg_dtype
    err = "Input contains {} or a value too large for {!r}.".format(
        rejected, reported_dtype)

    use_daal = (
        X.ndim in [1, 2]
        and not np.any(np.equal(X.shape, 0))
        and dt in [np.float32, np.float64]
    )
    if use_daal:
        if X.ndim == 1:
            X = X.reshape((-1, 1))
        # Third argument: 0 is passed for float64 data and 1 for float32
        # (presumably a precision selector -- confirm against d4p).
        precision_flag = 0 if dt == np.float64 else 1
        if not d4p.daal_assert_all_finite(X, allow_nan, precision_flag):
            raise ValueError(err)
        return

    if is_float:
        # Fast path for the common all-finite case: a finite total proves
        # every element is finite in O(n) time and O(1) extra space.  The
        # sum goes through the safe accumulator to reduce dtype-induced
        # overflow.
        if np.isfinite(_safe_accumulator_op(np.sum, X)):
            return
        # A non-finite total may just be overflow of the sum, so fall
        # back to the O(n)-space element-wise check before rejecting.
        if allow_nan:
            has_forbidden = np.isinf(X).any()
        else:
            has_forbidden = not np.isfinite(X).all()
        if has_forbidden:
            raise ValueError(err)
        return

    # for object dtype data, we only check for NaNs (GH-13254)
    if dt == np.dtype('object') and not allow_nan:
        if _object_dtype_isnan(X).any():
            raise ValueError("Input contains NaN")