Beispiel #1
0
def test_distribution_statistics_autodist_base():
    """
    Check AutoDist end to end and cross-check it against DistributionStatistics.

    Two frames of seeded normal noise with identical columns are compared.
    The test verifies that:

    * ``fitted`` flips from falsy to truthy once ``compute`` has run,
    * ``compute`` returns a DataFrame equal to the one stored on ``result``,
    * the ``column`` column lists the requested features in order,
    * the KS p-value/statistic reported for ``feat_0`` equal the values a
      standalone ``DistributionStatistics`` produces, both with the
      ``simplebucketer`` strategy (10 bins) and without any binning.
    """
    nr_features = 2
    size = 1000
    np.random.seed(0)
    column_labels = [f"feat_{x}" for x in range(nr_features)]
    # df1 is drawn before df2, so the seeded random stream is consumed in a fixed order.
    df1 = pd.DataFrame(np.random.normal(size=(size, nr_features)), columns=column_labels)
    df2 = pd.DataFrame(np.random.normal(size=(size, nr_features)), columns=column_labels)
    features = df1.columns
    myAutoDist = AutoDist(statistical_tests="all", binning_strategies="all", bin_count=[10, 20])
    assert repr(myAutoDist).startswith("AutoDist")
    assert not myAutoDist.fitted
    res = myAutoDist.compute(df1, df2, column_names=features)
    assert myAutoDist.fitted
    pd.testing.assert_frame_equal(res, myAutoDist.result)
    assert isinstance(res, pd.DataFrame)
    assert res["column"].values.tolist() == features.to_list()

    # Rows of the AutoDist result that describe feat_0. The single matching
    # value is read positionally with .iloc[0], so the lookup does not depend
    # on which index label that row happens to carry.
    is_feat_0 = res["column"] == "feat_0"

    dist = DistributionStatistics(statistical_test="ks", binning_strategy="simplebucketer", bin_count=10)
    dist.compute(df1["feat_0"], df2["feat_0"])
    assert dist.p_value == res.loc[is_feat_0, "p_value_KS_simplebucketer_10"].iloc[0]
    assert dist.statistic == res.loc[is_feat_0, "statistic_KS_simplebucketer_10"].iloc[0]

    # With binning disabled the result columns use the "no_bucketing_0" suffix.
    dist = DistributionStatistics(statistical_test="ks", binning_strategy=None, bin_count=10)
    dist.compute(df1["feat_0"], df2["feat_0"])
    assert dist.p_value == res.loc[is_feat_0, "p_value_KS_no_bucketing_0"].iloc[0]
    assert dist.statistic == res.loc[is_feat_0, "statistic_KS_no_bucketing_0"].iloc[0]
def test_distribution_statistics_attributes_ks():
    """
    Check that the attributes stored by a KS run match a direct ``ks`` call.

    Both the ``statistic`` and the ``p_value`` kept on the object after
    ``compute`` must equal what ``ks`` returns for the same two inputs.
    """
    # A fixed seed keeps the inputs identical from run to run, so a failure can be replayed.
    np.random.seed(0)
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.normal(size=1000), 10)[0]
    myTest = DistributionStatistics('ks', binning_strategy=None)
    _ = myTest.compute(d1, d2, verbose=False)
    ks_value, p_value = ks(d1, d2)
    assert myTest.statistic == ks_value
    # The p-value was unpacked but never checked before; verify it as well.
    assert myTest.p_value == p_value
def test_distribution_statistics_ks_no_binning():
    """
    Check a KS run without binning: ``fitted`` flips and a tuple is returned.

    The inputs are 10-bin histogram counts of a normal sample and of a
    shifted Weibull sample.
    """
    # A fixed seed keeps the inputs identical from run to run, so a failure can be replayed.
    np.random.seed(0)
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.weibull(1, size=1000) - 1, 10)[0]
    myTest = DistributionStatistics('ks', binning_strategy=None)
    assert not myTest.fitted
    res = myTest.compute(d1, d2)
    assert myTest.fitted
    assert isinstance(res, tuple)
def test_distribution_statistics_tuple_output():
    """
    Check a KS run with the ``SimpleBucketer`` strategy (10 bins).

    ``fitted`` must flip from falsy to truthy after ``compute``, and the
    value returned by ``compute`` must be a tuple.
    """
    # A fixed seed keeps the inputs identical from run to run, so a failure can be replayed.
    np.random.seed(0)
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.weibull(1, size=1000) - 1, 10)[0]
    myTest = DistributionStatistics('ks', 'SimpleBucketer', bin_count=10)
    assert not myTest.fitted
    res = myTest.compute(d1, d2)
    assert myTest.fitted
    assert isinstance(res, tuple)
def test_distribution_statistics_psi():
    """
    Check a PSI run with the ``SimpleBucketer`` strategy (10 bins).

    ``fitted`` must flip from falsy to truthy after ``compute``; the result
    must unpack into two values, the first of which is a number.
    """
    # A fixed seed keeps the inputs identical from run to run, so a failure can be replayed.
    np.random.seed(0)
    d1 = np.histogram(np.random.normal(size=1000), 10)[0]
    d2 = np.histogram(np.random.weibull(1, size=1000) - 1, 10)[0]
    myTest = DistributionStatistics('psi', 'SimpleBucketer', bin_count=10)
    assert not myTest.fitted
    # Unpacking still enforces a two-element result; the second element is not inspected here.
    psi_test, _ = myTest.compute(d1, d2)
    assert myTest.fitted
    assert isinstance(psi_test, numbers.Number)
def test_distribution_statistics_attributes_psi():
    """
    Check that the statistic stored by a PSI run matches a direct ``psi`` call.

    Both computations receive the same 10-bin histogram counts and run
    without any binning strategy.
    """
    # A fixed seed keeps the inputs identical from run to run, so a failure can be replayed.
    np.random.seed(0)
    a = np.random.normal(size=1000)
    b = np.random.normal(size=1000)
    d1 = np.histogram(a, 10)[0]
    d2 = np.histogram(b, 10)[0]
    myTest = DistributionStatistics('psi', binning_strategy=None)
    _ = myTest.compute(d1, d2, verbose=False)
    # psi returns a pair; only the statistic is compared in this test.
    psi_value_test, _ = psi(d1, d2, verbose=False)
    assert myTest.statistic == psi_value_test
Beispiel #7
0
def test_distribution_statistics_autodist_base():
    '''
    Check AutoDist end to end and cross-check it against DistributionStatistics.

    Two frames of seeded normal noise with identical columns are compared.
    The result returned by ``compute`` must equal ``myAutoDist.result``, list
    the requested features in its ``column`` column, and report for
    ``feat_0`` the same KS p-value/statistic as a standalone
    ``DistributionStatistics`` run (with ``simplebucketer``/10 bins and
    without binning).
    '''
    nr_features = 2
    size = 1000
    np.random.seed(0)
    feature_names = [f'feat_{x}' for x in range(nr_features)]
    # df1 is drawn before df2, so the seeded random stream is consumed in a fixed order.
    df1 = pd.DataFrame(np.random.normal(size=(size, nr_features)),
                       columns=feature_names)
    df2 = pd.DataFrame(np.random.normal(size=(size, nr_features)),
                       columns=feature_names)
    features = df1.columns
    myAutoDist = AutoDist(statistical_tests='all',
                          binning_strategies='all',
                          bin_count=[10, 20])
    assert repr(myAutoDist).startswith('AutoDist')
    assert not myAutoDist.fitted
    res = myAutoDist.compute(df1, df2, column_names=features)
    assert myAutoDist.fitted
    pd.testing.assert_frame_equal(res, myAutoDist.result)
    assert isinstance(res, pd.DataFrame)
    assert res['column'].values.tolist() == features.to_list()

    def feat_0_value(result_column):
        # Read the single feat_0 entry positionally, so the lookup does not
        # depend on which index label that row happens to carry.
        return res.loc[res['column'] == 'feat_0', result_column].iloc[0]

    dist = DistributionStatistics(statistical_test='ks',
                                  binning_strategy='simplebucketer',
                                  bin_count=10)
    dist.compute(df1['feat_0'], df2['feat_0'])
    assert dist.p_value == feat_0_value('p_value_KS_simplebucketer_10')
    assert dist.statistic == feat_0_value('statistic_KS_simplebucketer_10')

    # With binning disabled the result columns use the 'no_bucketing_0' suffix.
    dist = DistributionStatistics(statistical_test='ks',
                                  binning_strategy=None,
                                  bin_count=10)
    dist.compute(df1['feat_0'], df2['feat_0'])
    assert dist.p_value == feat_0_value('p_value_KS_no_bucketing_0')
    assert dist.statistic == feat_0_value('statistic_KS_no_bucketing_0')