def test_two_leaves():
    """Check leaf membership and stratify_cats() output when two leaves form.

    The toy frame has two distinct x1 values; the test expects get_leaves()
    to report samples {0, 2} and {1, 3} as the two leaves, and then checks
    the deltas, counts, reference categories and ignored count returned by
    stratify_cats() for column x2.
    """
    np.random.seed(999)
    rows = [
        # x1, x2, y     stratify x1, consider y ~ x2
        [1, 3, 5],
        [2, 4, 6],
        [1, 4, 5],
        [2, 2, 4],
    ]
    df = pd.DataFrame(rows, columns=['x1', 'x2', 'y'])
    y = df['y']
    X = df.drop('y', axis=1)

    # Index of the samples expected in each leaf.
    want_leaves = [
        np.array([0, 2]),  # leaf 0
        np.array([1, 3]),  # leaf 1
    ]
    got_leaves = get_leaves(X, y, 'x2')
    np.testing.assert_array_equal(got_leaves, want_leaves)

    # Rows appear to be indexed by x2 category value (0..4) and columns by
    # leaf; nan marks a category that has no sample in that leaf.
    want_deltas = np.array([
        [nan, nan],  # 0
        [nan, nan],  # 1
        [nan, 0],    # 2
        [0, nan],    # 3
        [0, 2],      # 4
    ])
    want_counts = np.array([
        [0, 0],
        [0, 0],
        [0, 1],
        [1, 0],
        [1, 1],
    ])
    want_refcats = np.array([4, 2])

    result = stratify_cats(X, y, colname="x2", min_samples_leaf=2)
    got_deltas, got_counts, got_refcats, n_ignored = result

    np.testing.assert_array_almost_equal(got_deltas, want_deltas, decimal=1)
    np.testing.assert_array_equal(got_counts, want_counts)
    np.testing.assert_array_equal(got_refcats, want_refcats)
    assert n_ignored == 0
def test_single_leaf():
    """Check stratify_cats() output when all samples land in one leaf.

    x1 is constant, and the test expects get_leaves() to return a single
    leaf holding all four samples. With min_samples_leaf=4 the expected
    deltas, counts and reference category therefore have one column/entry.
    """
    np.random.seed(999)
    rows = [
        # x1, x2, y     stratify x1, consider y ~ x2
        [1, 3, 5],
        [1, 4, 6],
        [1, 4, 5],
        [1, 2, 4],
    ]
    df = pd.DataFrame(rows, columns=['x1', 'x2', 'y'])
    y = df['y']
    X = df.drop('y', axis=1)

    # Index of the samples expected in the only leaf.
    want_leaves = [np.array([0, 1, 2, 3])]  # leaf 0
    got_leaves = get_leaves(X, y, 'x2')
    np.testing.assert_array_equal(got_leaves, want_leaves)

    # Column vectors (shape 5x1): one row per x2 category value 0..4.
    want_deltas = np.array([[nan], [nan], [-1], [0], [.5]])
    want_counts = np.array([[0], [0], [1], [1], [2]])
    want_refcats = np.array([3])

    result = stratify_cats(X, y, colname="x2", min_samples_leaf=4)
    got_deltas, got_counts, got_refcats, n_ignored = result

    np.testing.assert_array_almost_equal(got_deltas, want_deltas, decimal=1)
    np.testing.assert_array_equal(got_counts, want_counts)
    np.testing.assert_array_equal(got_refcats, want_refcats)
    assert n_ignored == 0
def test_three_leaves_no_overlap():
    """Check stratify_cats() when three leaves share no x2 category.

    Each pair of samples has its own x1 value and its own two x2
    categories, so the test expects three leaves whose category sets do
    not overlap: every row of the expected matrices has a value in at most
    one leaf column.
    """
    np.random.seed(999)
    df = pd.DataFrame(
        [
            # x1, x2, y     stratify x1, consider y ~ x2
            [1, 2, 9],
            [1, 3, 7],
            [3, 4, 6],
            [3, 5, 5],
            [4, 6, 4],
            [4, 7, 3],
        ],
        columns=['x1', 'x2', 'y'])
    X = df.drop('y', axis=1)
    y = df['y']

    # Index of the samples expected in each leaf.
    leaves = get_leaves(X, y, 'x2', min_samples_leaf=2)
    expected_leaves = [
        np.array([0, 1]),  # leaf 0
        np.array([2, 3]),  # leaf 1
        np.array([4, 5]),  # leaf 2
    ]
    np.testing.assert_array_equal(leaves, expected_leaves)

    # The sibling tests in this file unpack four values from stratify_cats()
    # (deltas, counts, refcats, ignored); unpacking only three here would
    # raise ValueError against that same return shape. The reference
    # categories are not asserted in this test.
    leaf_deltas, leaf_counts, _refcats, ignored = stratify_cats(
        X, y, colname="x2", min_samples_leaf=2)

    expected_leaf_deltas = np.array([
        [nan, nan, nan],  # cat 0 (absent from x2)
        [nan, nan, nan],  # cat 1 (absent from x2)
        [2, nan, nan],    # cat 2
        [0, nan, nan],    # cat 3
        [nan, 1, nan],    # cat 4
        [nan, 0, nan],    # cat 5
        [nan, nan, 1],    # cat 6
        [nan, nan, 0],    # cat 7
    ])
    expected_leaf_counts = np.array([
        [0, 0, 0],
        [0, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 1],
    ])
    np.testing.assert_array_almost_equal(leaf_deltas, expected_leaf_deltas, decimal=1)
    np.testing.assert_array_equal(leaf_counts, expected_leaf_counts)
    assert ignored == 0
def test_two_leaves_with_2nd_ignored():
    """Check stratify_cats() when the second leaf holds a single x2 category.

    Samples 2 and 3 share x1 == 2 and both have x2 == 4, so the second leaf
    contains only one category.
    """
    np.random.seed(999)
    df = pd.DataFrame(
        [
            # x1, x2, y     stratify x1, consider y ~ x2
            [1, 3, 5],
            [1, 4, 6],
            [2, 4, 7],
            [2, 4, 8],
        ],
        columns=['x1', 'x2', 'y'])
    X = df.drop('y', axis=1)
    y = df['y']

    # NOTE(review): the note originally placed here said the second leaf
    # (indexes 2,3, same x2 value) "must" be ignored, leaving one leaf with
    # cats 3 and 4 and leaf_deltas [[nan] [nan] [nan] [0.] [1.]]. The
    # expectations below instead keep two leaf columns and assert
    # ignored == 0 -- confirm which behavior is intended.

    # Index of the samples expected in each leaf.
    leaves = get_leaves(X, y, 'x2')
    expected_leaves = [
        np.array([0, 1]),  # leaf 0
        np.array([2, 3]),  # leaf 1
    ]
    np.testing.assert_array_equal(leaves, expected_leaves)

    # The sibling tests in this file unpack four values from stratify_cats()
    # (deltas, counts, refcats, ignored); unpacking only three here would
    # raise ValueError against that same return shape. The reference
    # categories are not asserted in this test.
    leaf_deltas, leaf_counts, _refcats, ignored = stratify_cats(
        X, y, colname="x2", min_samples_leaf=2)

    expected_leaf_deltas = np.array([
        [nan, nan],  # cat 0
        [nan, nan],  # cat 1
        [nan, nan],  # cat 2
        [0, nan],    # cat 3
        [1, 0],      # cat 4
    ])
    expected_leaf_counts = np.array([
        [0, 0],
        [0, 0],
        [0, 0],
        [1, 0],
        [1, 2],
    ])
    np.testing.assert_array_almost_equal(leaf_deltas, expected_leaf_deltas, decimal=1)
    np.testing.assert_array_equal(leaf_counts, expected_leaf_counts)
    assert ignored == 0
def speed_ModelID():
    """Time avg_values_at_cat() on the bulldozer ModelID column.

    Manual benchmark, not collected as a test (no ``test_`` prefix). Only
    the avg_values_at_cat() call is timed; loading the data and
    stratifying happen before the timer starts.

    Original author's note: I believe none of this is in the JIT path;
    repeated runs are same speed.
    """
    np.random.seed(1)
    n = 20_000
    min_samples_leaf = 5
    X, y = load_bulldozer(n=n)

    # The tests in this file unpack four values from stratify_cats()
    # (deltas, counts, refcats, ignored); unpacking only three would raise
    # ValueError against that same return shape. The last two are unused
    # by this benchmark.
    leaf_deltas, leaf_counts, _refcats, _ignored = stratify_cats(
        X, y, colname="ModelID", min_samples_leaf=min_samples_leaf)

    # NOTE(review): if avg_values_at_cat() also takes the reference
    # categories, pass _refcats here -- its signature is not visible from
    # this file, so the call is left as it was.
    start = timer()
    avg_values_at_cat(leaf_deltas, leaf_counts, max_iter=10)
    stop = timer()

    nunique = len(np.unique(X['ModelID']))
    print(
        f"n={n}, unique cats {nunique}, min_samples_leaf={min_samples_leaf}: avg_values_at_cat {stop - start:.3f}s"
    )