Example #1
def test_dask_pairdensity(df, column_properties, column_summary, frequencies):
    pds = []
    for col1, col2 in itertools.combinations(df.columns, 2):
        cp = {k: column_properties[k] for k in [col1, col2]}
        cs = {k: column_summary[k] for k in [col1, col2]}
        fr = {k: frequencies[k] for k in [col1, col2]}
        pd = metrics.pairdensity(df[[col1, col2]], cp, cs, fr)
        if pd is not None:
            if should_pair_density_norm_be_finite(df[[col1, col2]], cp):
                if (
                    not cp[col1][col1]["is_categorical"]
                    and not cp[col2][col2]["is_categorical"]
                    and "poisson" not in col1
                    and "poisson" not in col2
                ):
                    filename = "{}/{}_{}_{}_pd_diff.png".format(
                        test_results_dir, len(df.index), col1, col2
                    )
                    mean_dev = compute_deviation_with_kde(
                        df[[col1, col2]], pd, filename
                    )
                    assert mean_dev < 0.02
                assert (
                    np.sum(pd[col1][col2]["density"]) > 0
                ), "Failed on columns {} - {}".format(col1, col2)

        pds.append(pd)

    joined = _join_dask_results(pds).compute()

    # test serialization
    json.dumps({"pairdensity": joined}, cls=NumpyEncoder)
Example #2
def test_dask_outliers(df, column_summary):
    reps = []
    for col in df.columns:
        reps.append(metrics.outliers(df[col], column_summary[col]))

    # test serialization
    joined = _join_dask_results(reps).compute()
    json.dumps({"outliers": joined}, cls=NumpyEncoder)
Example #3
def test_dask_column_properties(column_properties):
    # Only worth checking that we determine categorical columns
    # correctly if there are enough rows in the dataframe.
    # There are 13 distinct categories.
    categorical13_props = column_properties["categorical13"]["categorical13"]
    row_threshold = 2 * 13.0 / CAT_FRAC_THRESHOLD
    if categorical13_props["notnulls"] > row_threshold:
        assert categorical13_props["is_categorical"]

    # test serialization
    joined = _join_dask_results(column_properties.values()).compute()
    json.dumps({"column_summary": joined}, cls=NumpyEncoder)
Example #4
def test_dask_correlation(df, column_properties):
    cp = _join_dask_results(column_properties.values()).compute()
    rep = metrics.correlation(df, cp)
    cols = rep["_columns"]
    sp = np.array(rep["spearman"])
    order = rep["order"]

    assert len(order) == len(cols)
    assert sp.shape[0] == len(cols)
    assert sp.shape[1] == len(cols)

    # test serialization
    json.dumps({"correlation": rep}, cls=NumpyEncoder)
Example #5
def test_dask_frequencies(df, frequencies):
    for col in frequencies.keys():
        freq_report = frequencies[col]
        if freq_report is None:
            continue
        else:
            freq_report = freq_report[col]

        freqs = df[col].value_counts().to_dict()

        for k in freqs.keys():
            assert freqs[k] == freq_report[k]

    # test serialization
    joined = _join_dask_results(frequencies.values()).compute()
    json.dumps({"freqs": joined}, cls=NumpyEncoder)
Example #6
def test_dask_column_summary(df, column_summary):
    for col in df.columns:
        series = df[col]
        cs_report = column_summary[col]

        if cs_report is None or series.isnull().sum() == len(df.index):
            continue

        else:
            cs_report = cs_report[col]

        # Test that only the lognormal column is flagged for a log transform.
        # Only run this check if the column has enough valid values.
        if len(df.index) >= 50:
            if col == "lognormal":
                assert cs_report["logtrans"]
            else:
                assert not cs_report["logtrans"]

        _percs = list(cs_report["percentiles"].keys())
        _percs.sort()
        cs_report_perc = [cs_report["percentiles"][p] for p in _percs]
        exact_perc = np.nanpercentile(series, _percs)
        np.testing.assert_allclose(
            cs_report_perc, exact_perc, rtol=1e-3, atol=1e-3
        )

        exact_meanminmax = [
            np.nanmean(series.to_numpy()),
            np.nanmin(series.to_numpy()),
            np.nanmax(series.to_numpy()),
        ]
        rep_meanminmax = [cs_report[x] for x in ["mean", "min", "max"]]
        np.testing.assert_allclose(
            exact_meanminmax, rep_meanminmax, rtol=1e-3, atol=0.01
        )

        # test histogram
        histogram = cs_report["histogram"]

        assert np.sum(histogram["counts"]) == series.notnull().sum()
        if cs_report["n"] > 1 and not np.all(np.mod(series.dropna(), 1) == 0):
            # Bin edges for single-count histograms are not relevant, and
            # integer-only histograms are not bounded by the extremes of the
            # distribution.
            assert np.allclose(histogram["bin_edges"][0], series.min())
            assert np.allclose(histogram["bin_edges"][-1], series.max())

        if col == "categoricalint":
            # Check that bins are set correctly for integers.
            # The value 2 is removed from this column, so there should be at
            # least one empty bin in the histogram.
            n_unique = series.dropna().unique().size
            assert len(histogram["counts"]) >= n_unique
            assert len(histogram["bin_edges"]) == len(histogram["counts"]) + 1

            # Check that the bin that contains 2 is set to 0
            idx = np.where(np.array(histogram["bin_edges"]) < 2)[0][-1]
            assert histogram["counts"][idx] == 0

            assert np.allclose(
                histogram["bin_edges"][0], series.dropna().min() - 0.5
            )
            assert np.allclose(
                histogram["bin_edges"][-1], series.dropna().max() + 0.5
            )

        # test kde
        kde = cs_report["kde"]

        assert np.all(~np.isnan(kde["x"]))
        assert np.all(~np.isnan(kde["y"]))

        if "categorical" not in col and np.sum(kde["y"]) > 0:
            assert np.allclose(np.trapz(kde["y"], kde["x"]), 1)

        if col == "normal":
            mean = cs_report["mean"]
            kde_max = kde["x"][np.argmax(kde["y"])]
            assert np.allclose(kde_max, mean, atol=5, rtol=0.1)

    # test serialization
    joined = _join_dask_results(column_summary.values()).compute()
    json.dumps({"column_summary": joined}, cls=NumpyEncoder)