Beispiel #1
0
    def _profile_1d_histogram(self, name, hist):
        """Compute a profile (summary statistics) of a one-dimensional histogram.

        :param name: name of the histogram. Used in the warning message and
            looked up in ``self.var_timestamp`` to decide whether the bin
            labels should be treated as timestamps.
        :param hist: histogram object accepted by ``get_bin_centers``. The
            optional attributes ``nanflow``, ``bins``, ``overflow`` and
            ``underflow`` are used when present.
        :return: dict with the keys ``filled``, ``nan``, ``overflow``,
            ``underflow``, ``count``, ``distinct`` and ``most_probable_value``.
            For numeric histograms with at least one filled entry, the results
            of every function in ``self.stats_functions`` are added under the
            comma-separated names of its key. For non-numeric histograms
            ``fraction_true`` is added. An empty dict is returned when the
            histogram has no bins.
        """
        is_num = is_numeric(hist)
        is_ts = is_timestamp(hist) or name in self.var_timestamp

        # Query the bin centers once and reuse both parts of the result.
        bin_centers = get_bin_centers(hist)
        bin_labels = np.array(bin_centers[0])
        bin_counts = np.array([v.entries for v in bin_centers[1]])

        if len(bin_counts) == 0:
            self.logger.warning(f'Histogram "{name}" is empty; skipping.')
            return {}

        if is_ts:
            # Work on the integer ``.value`` of each timestamp so the numeric
            # statistics functions below can operate on the labels.
            to_timestamp = np.vectorize(lambda x: pd.to_datetime(x).value)
            bin_labels = to_timestamp(bin_labels)

        profile = {}
        profile["filled"] = bin_counts.sum()

        # NaN entries live either in a dedicated ``nanflow`` counter or in a
        # bin labelled "NaN", depending on what the histogram provides.
        if hasattr(hist, "nanflow"):
            profile["nan"] = hist.nanflow.entries
        elif hasattr(hist, "bins") and "NaN" in hist.bins:
            profile["nan"] = hist.bins["NaN"].entries
        else:
            profile["nan"] = 0

        profile["overflow"] = (
            hist.overflow.entries if hasattr(hist, "overflow") else 0
        )
        profile["underflow"] = (
            hist.underflow.entries if hasattr(hist, "underflow") else 0
        )
        profile["count"] = profile["filled"] + profile["nan"]
        profile["distinct"] = len(np.unique(bin_labels))

        mpv = bin_labels[np.argmax(bin_counts)]  # most probable value
        profile["most_probable_value"] = pd.Timestamp(mpv) if is_ts else mpv

        if is_num and profile["filled"] > 0:
            for f_names, func in self.stats_functions.items():
                # One key may name several statistics, e.g. "a,b,c"; the
                # function then returns one result per name.
                names = f_names.split(",")
                results = func(bin_labels, bin_counts)
                if len(names) == 1:
                    results = [results]

                if is_ts:
                    # Convert the integer results back to pandas time types.
                    # Pair each result with its statistic name (``names``),
                    # not with the characters of the histogram ``name``:
                    # "std" is a duration, everything else a point in time.
                    results = [
                        pd.Timedelta(result)
                        if f_name == "std"
                        else pd.Timestamp(result)
                        for f_name, result in zip(names, results)
                    ]

                profile.update(dict(zip(names, results)))
        elif not is_num:
            profile["fraction_true"] = pm_np.fraction_of_true(
                bin_labels, bin_counts
            )

        return profile
Beispiel #2
0
def test_fraction_of_true():
    """Check ``fraction_of_true`` against NaN-producing and exact-value inputs."""
    # Inputs for which the result is expected to be NaN.
    nan_inputs = [
        ([], []),
        (["a"], [10]),
        (["a", "b", "c"], [10, 10, 10]),
        (np.array(["True", "False"]), np.array([0, 0])),
    ]
    for labels, counts in nan_inputs:
        assert np.isnan(fraction_of_true(labels, counts))

    # Inputs with an exact expected fraction, for string and boolean labels.
    exact_inputs = [
        (np.array(["True", "False"]), np.array([10, 10]), 0.5),
        (np.array([True, False]), [10, 10], 0.5),
        (np.array(["True"]), np.array([10]), 1.0),
        (np.array([True]), np.array([10]), 1.0),
        (np.array(["False"]), np.array([10]), 0.0),
        (np.array([False]), np.array([10]), 0.0),
    ]
    for labels, counts, expected in exact_inputs:
        assert fraction_of_true(labels, counts) == expected
Beispiel #3
0
def test_fraction_of_true():
    """Verify ``fraction_of_true`` on a table of (labels, counts, expected)."""

    def check(labels, counts, expected):
        # NaN never compares equal to itself, so it needs its own check.
        outcome = fraction_of_true(labels, counts)
        if np.isnan(expected):
            assert np.isnan(outcome)
        else:
            assert outcome == expected

    cases = (
        # expected NaN
        ([], [], np.nan),
        (['a'], [10], np.nan),
        (['a', 'b', 'c'], [10, 10, 10], np.nan),
        (np.array(['True', 'False']), np.array([0, 0]), np.nan),
        # mixed true/false labels
        (np.array(['True', 'False']), np.array([10, 10]), 0.5),
        (np.array([True, False]), [10, 10], 0.5),
        # a single true or false label
        (np.array(['True']), np.array([10]), 1.0),
        (np.array([True]), np.array([10]), 1.0),
        (np.array(['False']), np.array([10]), 0.0),
        (np.array([False]), np.array([10]), 0.0),
    )
    for labels, counts, expected in cases:
        check(labels, counts, expected)