def test_score_bad():
    """Check that ``score`` stays below 0.1 for two mismatched pairs.

    The first pair differs only in how the categorical column is sampled;
    the second pair differs only in the mean of the second continuous column.
    """
    def _draw(second_loc, labels):
        # Columns are sampled in a fixed order (continuous, continuous,
        # categorical) so the sequence of RNG draws is stable.
        return np.vstack([
            np.random.normal(size=100),
            np.random.normal(loc=second_loc, size=100),
            np.random.choice(labels, size=100).astype('O'),
        ]).T

    def _pair_score(parts):
        return score(
            parts,
            [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL],
        )

    # Case 1: B picks its labels from a pool where 'f' appears three times
    # as often as 'm', while A picks uniformly from the two labels.
    uniform_labels = _draw(-5, ['f', 'm'])
    skewed_labels = _draw(-5, ['f', 'm', 'f', 'f'])
    categorical_mismatch = _pair_score([uniform_labels, skewed_labels])
    assert categorical_mismatch < .1

    # Case 2: same label pool, but the second column is centred at -5 in A
    # and at +5 in B.
    negative_mean = _draw(-5, ['f', 'm'])
    positive_mean = _draw(5, ['f', 'm'])
    continuous_mismatch = _pair_score([negative_mean, positive_mean])
    assert continuous_mismatch < .1
def test_optimized_split_basic():
    """Split a random ndarray with default options and expect a score > 0.95.

    The result of ``optimized_split`` is unpacked into exactly two parts.
    """
    var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]

    # Two continuous columns and one categorical column, 100 rows.
    first_numeric = np.random.normal(size=100)
    second_numeric = np.random.normal(loc=-5, size=100)
    labels = np.random.choice(['f', 'm'], size=100).astype('O')
    X = np.vstack([first_numeric, second_numeric, labels]).T

    left, right = optimized_split(X)

    balance = score([left, right], var_types)
    assert balance > .95
def test_optimized_split_df():
    """Pass a pandas DataFrame to ``optimized_split`` and expect score > 0.95.

    The frame wraps the same kind of random data used by the ndarray tests,
    with columns named 'a', 'b' and 'c'.
    """
    var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]

    # Two continuous columns and one categorical column, 100 rows.
    first_numeric = np.random.normal(size=100)
    second_numeric = np.random.normal(loc=-5, size=100)
    labels = np.random.choice(['f', 'm'], size=100).astype('O')
    raw = np.vstack([first_numeric, second_numeric, labels]).T

    frame = pd.DataFrame(data=raw, columns=['a', 'b', 'c'])

    left, right = optimized_split(frame)

    balance = score([left, right], var_types)
    assert balance > .95
def test_score_good():
    """Check that ``score`` exceeds 0.9 for two nearly identical partitions.

    The second partition is a copy of the first with small Gaussian noise
    added to both continuous columns and a single categorical label flipped.
    """
    var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]

    first_numeric = np.random.normal(size=100)
    second_numeric = np.random.normal(loc=-5, size=100)
    labels = np.random.choice(['f', 'm'], size=100).astype('O')
    reference = np.vstack([first_numeric, second_numeric, labels]).T

    perturbed = reference.copy()
    n_rows = perturbed.shape[0]

    # Jitter the continuous columns (noise scale 0.1 and 0.2 respectively).
    perturbed[:, 0] += np.random.normal(scale=.1, size=n_rows)
    perturbed[:, 1] += np.random.normal(scale=.2, size=n_rows)

    # Flip the label of one randomly chosen row.
    flip_row = np.random.choice(np.arange(n_rows))
    if perturbed[flip_row, 2] == 'f':
        perturbed[flip_row, 2] = 'm'
    else:
        perturbed[flip_row, 2] = 'f'

    similarity = score([reference, perturbed], var_types)
    assert similarity > .9
def test_optimized_split_callback():
    """Verify ``iter_callback`` is invoked and reports the final score.

    The callback must have been called more than 10 times, and the last
    score it received must equal the score of the returned partitions.
    """
    first_numeric = np.random.normal(size=100)
    second_numeric = np.random.normal(loc=-5, size=100)
    labels = np.random.choice(['f', 'm'], size=100).astype('O')
    X = np.vstack([first_numeric, second_numeric, labels]).T

    history = []

    def record(partitions, score):
        # Parameter names are kept as-is; inside this function ``score`` is
        # the reported value, not the module-level scoring function.
        history.append(score)

    left, right = optimized_split(X, iter_callback=record)

    # Checked in this order so the final score is only computed when the
    # callback count requirement is already satisfied.
    assert len(history) > 10
    final_score = score(
        [left, right],
        [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL],
    )
    assert history[-1] == final_score
def test_optimized_split_multiple():
    """Split 120 rows into 6 partitions and expect a score of at least 0.9."""
    # The seed is pinned because, in rare cases, the generated X is very
    # difficult to split into 6 balanced parts.
    np.random.seed(0)

    var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]

    first_numeric = np.random.normal(size=120)
    second_numeric = np.random.normal(loc=-5, size=120)
    labels = np.random.choice(['f', 'm'], size=120).astype('O')
    X = np.vstack([first_numeric, second_numeric, labels]).T

    partitions = optimized_split(
        X,
        n_partitions=6,
        max_iter=3000,
        score_threshold=.9,
    )

    balance = score(partitions, var_types)
    assert balance >= .9
# Example #7
# 0
import numpy as np
from balanced_splits.split import optimized_split, score, VarType

# Build a 100-row sample with two continuous columns and one categorical
# column, split it with the default options, and print the resulting score.
columns = [
    np.random.normal(size=100),
    np.random.normal(loc=-5, size=100),
    np.random.choice(['f', 'm'], size=100).astype('O'),
]
X = np.vstack(columns).T

A, B = optimized_split(X)

var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]
print(score([A, B], var_types))