def test_dask_pairdensity(df, column_properties, column_summary, frequencies):
    pds = []
    for col1, col2 in itertools.combinations(df.columns, 2):
        cp = {k: column_properties[k] for k in [col1, col2]}
        cs = {k: column_summary[k] for k in [col1, col2]}
        fr = {k: frequencies[k] for k in [col1, col2]}
        pd = metrics.pairdensity(df[[col1, col2]], cp, cs, fr)

        if pd is not None:
            if should_pair_density_norm_be_finite(df[[col1, col2]], cp):
                if (
                    not cp[col1][col1]["is_categorical"]
                    and not cp[col2][col2]["is_categorical"]
                    and "poisson" not in col1
                    and "poisson" not in col2
                ):
                    filename = "{}/{}_{}_{}_pd_diff.png".format(
                        test_results_dir, len(df.index), col1, col2
                    )
                    mean_dev = compute_deviation_with_kde(
                        df[[col1, col2]], pd, filename
                    )

                    assert mean_dev < 0.02

                assert (
                    np.sum(pd[col1][col2]["density"]) > 0
                ), "Failed on columns {} - {}".format(col1, col2)

            pds.append(pd)

    joined = _join_dask_results(pds).compute()

    # test serialization
    json.dumps({"pairdensity": joined}, cls=NumpyEncoder)
def test_dask_outliers(df, column_summary):
    reps = []
    for col in df.columns:
        reps.append(metrics.outliers(df[col], column_summary[col]))

    # test serialization
    joined = _join_dask_results(reps).compute()
    json.dumps({"outliers": joined}, cls=NumpyEncoder)
def test_dask_column_properties(column_properties):
    # Only worth checking that we determine categorical columns
    # correctly if there are enough rows in the dataframe.
    # There are 13 distinct categories.
    categorical13_props = column_properties["categorical13"]["categorical13"]
    row_threshold = 2 * 13.0 / CAT_FRAC_THRESHOLD
    if categorical13_props["notnulls"] > row_threshold:
        assert categorical13_props["is_categorical"]

    # test serialization
    joined = _join_dask_results(column_properties.values()).compute()
    json.dumps({"column_properties": joined}, cls=NumpyEncoder)
def test_dask_correlation(df, column_properties):
    cp = _join_dask_results(column_properties.values()).compute()
    rep = metrics.correlation(df, cp)

    cols = rep["_columns"]
    sp = np.array(rep["spearman"])
    order = rep["order"]

    assert len(order) == len(cols)
    assert sp.shape[0] == len(cols)
    assert sp.shape[1] == len(cols)

    # test serialization
    json.dumps({"correlation": rep}, cls=NumpyEncoder)
def test_dask_frequencies(df, frequencies):
    for col in frequencies.keys():
        freq_report = frequencies[col]
        if freq_report is None:
            continue
        else:
            freq_report = freq_report[col]

        freqs = df[col].value_counts().to_dict()
        for k in freqs.keys():
            assert freqs[k] == freq_report[k]

    # test serialization
    joined = _join_dask_results(frequencies.values()).compute()
    json.dumps({"freqs": joined}, cls=NumpyEncoder)
def test_dask_column_summary(df, column_summary):
    for col in df.columns:
        series = df[col]
        cs_report = column_summary[col]
        if cs_report is None or series.isnull().sum() == len(df.index):
            continue
        else:
            cs_report = cs_report[col]

        # Test that only the lognormal column is flagged for log transform.
        # Only run this test if the column has enough valid values.
        if len(df.index) >= 50:
            if col == "lognormal":
                assert cs_report["logtrans"]
            else:
                assert not cs_report["logtrans"]

        _percs = list(cs_report["percentiles"].keys())
        _percs.sort()
        cs_report_perc = [cs_report["percentiles"][p] for p in _percs]
        exact_perc = np.nanpercentile(series, _percs)
        np.testing.assert_allclose(
            cs_report_perc, exact_perc, rtol=1e-3, atol=1e-3
        )

        exact_meanminmax = [
            np.nanmean(series.values),
            np.nanmin(series.values),
            np.nanmax(series.values),
        ]
        rep_meanminmax = [cs_report[x] for x in ["mean", "min", "max"]]
        np.testing.assert_allclose(
            exact_meanminmax, rep_meanminmax, rtol=1e-3, atol=0.01
        )

        # test histogram
        histogram = cs_report["histogram"]
        assert np.sum(histogram["counts"]) == series.notnull().sum()
        if cs_report["n"] > 1 and not np.all(np.mod(series.dropna(), 1) == 0):
            # Bin edges for single-count histograms are not relevant, and
            # integer-only histograms are not bounded by the extremes of
            # the distribution.
            assert np.allclose(histogram["bin_edges"][0], series.min())
            assert np.allclose(histogram["bin_edges"][-1], series.max())

        if col == "categoricalint":
            # Check that bins are set correctly for integers:
            # we are removing the twos, so there should be at least one
            # empty bin in the histogram.
            n_unique = series.dropna().unique().size
            assert len(histogram["counts"]) >= n_unique
            assert len(histogram["bin_edges"]) == len(histogram["counts"]) + 1

            # Check that the bin that contains 2 is set to 0
            idx = np.where(np.array(histogram["bin_edges"]) < 2)[0][-1]
            assert histogram["counts"][idx] == 0

            assert np.allclose(
                histogram["bin_edges"][0], series.dropna().min() - 0.5
            )
            assert np.allclose(
                histogram["bin_edges"][-1], series.dropna().max() + 0.5
            )

        # test kde
        kde = cs_report["kde"]
        assert np.all(~np.isnan(kde["x"]))
        assert np.all(~np.isnan(kde["y"]))
        if "categorical" not in col and np.sum(kde["y"]) > 0:
            # A non-degenerate KDE should integrate to one
            assert np.allclose(np.trapz(kde["y"], kde["x"]), 1)
        if col == "normal":
            # The KDE mode of a normal column should sit near its mean
            mean = cs_report["mean"]
            kde_max = kde["x"][np.argmax(kde["y"])]
            assert np.allclose(kde_max, mean, atol=5, rtol=0.1)

    # test serialization
    joined = _join_dask_results(column_summary.values()).compute()
    json.dumps({"column_summary": joined}, cls=NumpyEncoder)