def test_score_bad():
    """Check that `score` stays below 0.1 for clearly mismatched samples.

    Two scenarios are exercised: one where only the categorical column is
    drawn from different label pools, and one where the second continuous
    column is centred at -5 in one sample and at 5 in the other.
    """
    n_rows = 100
    var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]

    def make_sample(second_loc, labels):
        # Draw order (normal, normal, choice) is the same for every sample.
        first_col = np.random.normal(size=n_rows)
        second_col = np.random.normal(loc=second_loc, size=n_rows)
        label_col = np.random.choice(labels, size=n_rows).astype('O')
        return np.vstack([first_col, second_col, label_col]).T

    # Scenario 1: same continuous parameters, but the second sample picks
    # its labels from a pool that lists 'f' three times and 'm' once.
    even_labels = make_sample(-5, ['f', 'm'])
    skewed_labels = make_sample(-5, ['f', 'm', 'f', 'f'])
    assert score([even_labels, skewed_labels], var_types) < 0.1

    # Scenario 2: same label pool, but the second continuous column is
    # centred at -5 in one sample and at +5 in the other.
    centred_low = make_sample(-5, ['f', 'm'])
    centred_high = make_sample(5, ['f', 'm'])
    assert score([centred_low, centred_high], var_types) < 0.1
def test_optimized_split_basic():
    """Split a mixed-type array in two and expect a score above 0.95."""
    n_rows = 100
    columns = [
        np.random.normal(size=n_rows),
        np.random.normal(loc=-5, size=n_rows),
        np.random.choice(['f', 'm'], size=n_rows).astype('O'),
    ]
    # Stack the columns as rows, then transpose to one row per observation.
    data = np.vstack(columns).T

    part_a, part_b = optimized_split(data)

    var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]
    assert score([part_a, part_b], var_types) > 0.95
def test_optimized_split_df():
    """Split a pandas DataFrame in two and expect a score above 0.95."""
    n_rows = 100
    columns = [
        np.random.normal(size=n_rows),
        np.random.normal(loc=-5, size=n_rows),
        np.random.choice(['f', 'm'], size=n_rows).astype('O'),
    ]
    # Stack the columns as rows, then transpose to one row per observation.
    raw = np.vstack(columns).T

    # Wrap the array in a DataFrame with columns named 'a', 'b' and 'c'.
    df = pd.DataFrame(data=raw, columns=['a', 'b', 'c'])
    part_a, part_b = optimized_split(df)

    var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]
    assert score([part_a, part_b], var_types) > 0.95
def test_score_good():
    """Check that `score` exceeds 0.9 for two nearly identical samples.

    The second sample is a copy of the first with small Gaussian noise
    added to both continuous columns and one categorical label flipped.
    """
    n_rows = 100
    columns = [
        np.random.normal(size=n_rows),
        np.random.normal(loc=-5, size=n_rows),
        np.random.choice(['f', 'm'], size=n_rows).astype('O'),
    ]
    reference = np.vstack(columns).T

    perturbed = reference.copy()
    row_count = perturbed.shape[0]

    # Jitter the two continuous columns in place.
    noise_first = np.random.normal(scale=0.1, size=row_count)
    perturbed[:, 0] += noise_first
    noise_second = np.random.normal(scale=0.2, size=row_count)
    perturbed[:, 1] += noise_second

    # Flip the label of one randomly chosen row.
    flip_row = np.random.choice(np.arange(row_count))
    if perturbed[flip_row, 2] == 'f':
        perturbed[flip_row, 2] = 'm'
    else:
        perturbed[flip_row, 2] = 'f'

    var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]
    assert score([reference, perturbed], var_types) > 0.9
def test_optimized_split_callback():
    """Check the scores that `optimized_split` reports via `iter_callback`.

    The callback must fire more than 10 times, and the last score it
    receives must equal the score of the partitions that are returned.
    """
    n_rows = 100
    columns = [
        np.random.normal(size=n_rows),
        np.random.normal(loc=-5, size=n_rows),
        np.random.choice(['f', 'm'], size=n_rows).astype('O'),
    ]
    data = np.vstack(columns).T

    scores = []

    def callback(partitions, score):
        # Parameter names are kept as-is in case the optimizer passes them
        # by keyword. `score` shadows the module-level function only
        # inside this callback.
        scores.append(score)

    part_a, part_b = optimized_split(data, iter_callback=callback)

    # Checked first so the final score is only computed when enough
    # iterations were recorded.
    assert len(scores) > 10

    var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]
    final_score = score([part_a, part_b], var_types)
    assert scores[-1] == final_score
def test_optimized_split_multiple():
    """Split 120 rows into 6 partitions and expect a score of at least 0.9."""
    # In rare cases the generated data is very difficult to split into
    # 6 parts in a balanced manner, so the seed is fixed.
    np.random.seed(0)

    n_rows = 120
    threshold = 0.9
    columns = [
        np.random.normal(size=n_rows),
        np.random.normal(loc=-5, size=n_rows),
        np.random.choice(['f', 'm'], size=n_rows).astype('O'),
    ]
    data = np.vstack(columns).T

    parts = optimized_split(
        data,
        n_partitions=6,
        max_iter=3000,
        score_threshold=threshold,
    )

    var_types = [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL]
    assert score(parts, var_types) >= threshold
import numpy as np

from balanced_splits.split import VarType, optimized_split, score

# Usage example: build 100 rows with two continuous columns and one
# categorical column, split them into two partitions, and print the
# score of the resulting split.
X = np.vstack(
    [
        np.random.normal(size=100),
        np.random.normal(loc=-5, size=100),
        np.random.choice(['f', 'm'], size=100).astype('O'),
    ]
).T

A, B = optimized_split(X)

print(
    score(
        [A, B],
        [VarType.CONTINUOUS, VarType.CONTINUOUS, VarType.CATEGORICAL],
    )
)