def _profile_1d_histogram(self, name, hist):
    """Build a profile (summary statistics) of a one-dimensional histogram.

    :param str name: name of the histogram; it is also looked up in
        ``self.var_timestamp`` to decide whether bin labels are timestamps.
    :param hist: histogram to profile. It is passed to ``is_numeric``,
        ``is_timestamp`` and ``get_bin_centers``, and may optionally expose
        ``nanflow``, ``overflow``, ``underflow`` and ``bins`` attributes.
    :return: dict with the keys ``filled``, ``nan``, ``overflow``,
        ``underflow``, ``count``, ``distinct`` and ``most_probable_value``.
        Numeric histograms with at least one filled entry additionally get
        one entry per statistic produced by ``self.stats_functions``;
        non-numeric histograms get ``fraction_true`` instead. An empty dict
        is returned (and a warning logged) when the histogram has no bins.
    :rtype: dict
    """
    is_num = is_numeric(hist)
    is_ts = is_timestamp(hist) or name in self.var_timestamp

    # Query the bin centers once; element 0 holds the labels and element 1
    # the bin objects whose ``entries`` attribute is the bin count.
    bin_info = get_bin_centers(hist)
    bin_labels = np.array(bin_info[0])
    bin_counts = np.array([v.entries for v in bin_info[1]])

    if len(bin_counts) == 0:
        self.logger.warning(f'Histogram "{name}" is empty; skipping.')
        return {}

    if is_ts:
        # Work on the integer ``.value`` of each timestamp so the numeric
        # statistics functions below can operate on the labels.
        to_timestamp = np.vectorize(lambda x: pd.to_datetime(x).value)
        bin_labels = to_timestamp(bin_labels)

    profile = {}
    profile["filled"] = bin_counts.sum()

    # NaN entries live either in a dedicated ``nanflow`` counter or in a
    # bin labelled "NaN", depending on what the histogram exposes.
    if hasattr(hist, "nanflow"):
        profile["nan"] = hist.nanflow.entries
    elif hasattr(hist, "bins") and "NaN" in hist.bins:
        profile["nan"] = hist.bins["NaN"].entries
    else:
        profile["nan"] = 0

    profile["overflow"] = hist.overflow.entries if hasattr(hist, "overflow") else 0
    profile["underflow"] = hist.underflow.entries if hasattr(hist, "underflow") else 0
    profile["count"] = profile["filled"] + profile["nan"]
    profile["distinct"] = len(np.unique(bin_labels))

    mpv = bin_labels[np.argmax(bin_counts)]  # most probable value
    profile["most_probable_value"] = pd.Timestamp(mpv) if is_ts else mpv

    if is_num and profile["filled"] > 0:
        for f_names, func in self.stats_functions.items():
            # A key may name several statistics at once, comma-separated,
            # in which case ``func`` returns one result per name.
            names = f_names.split(",")
            results = func(bin_labels, bin_counts)
            if len(names) == 1:
                results = [results]

            if is_ts:
                # Convert back from integers: a standard deviation is a
                # duration, every other statistic is a point in time.
                # Pair results with the statistic names in ``names`` (not
                # with the characters of the histogram ``name``).
                results = [
                    pd.Timedelta(result) if f_name == "std" else pd.Timestamp(result)
                    for f_name, result in zip(names, results)
                ]

            profile.update(zip(names, results))
    elif not is_num:
        profile["fraction_true"] = pm_np.fraction_of_true(bin_labels, bin_counts)

    return profile
def test_fraction_of_true():
    """Check ``fraction_of_true`` on empty, non-boolean and boolean-like labels."""
    # Inputs for which the result is expected to be NaN: no bins, labels that
    # are not boolean-like, and boolean labels whose counts are all zero.
    undefined_cases = [
        ([], []),
        (["a"], [10]),
        (["a", "b", "c"], [10, 10, 10]),
        (np.array(["True", "False"]), np.array([0, 0])),
    ]
    for labels, counts in undefined_cases:
        outcome = fraction_of_true(labels, counts)
        assert np.isnan(outcome)

    # Boolean labels, given either as strings or as real booleans, together
    # with the exact fraction expected for them.
    defined_cases = [
        (np.array(["True", "False"]), np.array([10, 10]), 0.5),
        (np.array([True, False]), [10, 10], 0.5),
        (np.array(["True"]), np.array([10]), 1.0),
        (np.array([True]), np.array([10]), 1.0),
        (np.array(["False"]), np.array([10]), 0.0),
        (np.array([False]), np.array([10]), 0.0),
    ]
    for labels, counts, expected in defined_cases:
        outcome = fraction_of_true(labels, counts)
        assert outcome == expected
def test_fraction_of_true():
    """Verify ``fraction_of_true`` for undefined and well-defined inputs."""

    def expect_nan(labels, counts):
        # The fraction is expected to be NaN for this input.
        assert np.isnan(fraction_of_true(labels, counts))

    def expect_value(labels, counts, wanted):
        # The fraction is expected to match ``wanted`` exactly.
        assert fraction_of_true(labels, counts) == wanted

    # No bins at all, and labels that are not boolean-like.
    expect_nan([], [])
    expect_nan(['a'], [10])
    expect_nan(['a', 'b', 'c'], [10, 10, 10])
    # Boolean-like labels, but every count is zero.
    expect_nan(np.array(['True', 'False']), np.array([0, 0]))

    # Evenly split between true and false.
    expect_value(np.array(['True', 'False']), np.array([10, 10]), 0.5)
    expect_value(np.array([True, False]), [10, 10], 0.5)
    # Only true entries.
    expect_value(np.array(['True']), np.array([10]), 1.0)
    expect_value(np.array([True]), np.array([10]), 1.0)
    # Only false entries.
    expect_value(np.array(['False']), np.array([10]), 0.0)
    expect_value(np.array([False]), np.array([10]), 0.0)