def test_best_split_with_combination():
    """
    Find the best split on data where only the first column varies.

    The dependent variable holds five 1s followed by ten 2s. The first
    independent column takes the values 1, 2 and 3 in blocks of five rows,
    while the other two columns are constant. The test compares both inputs
    against copies taken beforehand, then checks the chosen column, the
    split map, the (empty) surrogate list and an upper bound on the p-value.
    """
    dependent = np.array([1] * 5 + [2] * 10)
    dependent_before = dependent.copy()
    flat_rows = [1, 2, 3] * 5 + [2, 2, 3] * 5 + [3, 2, 3] * 5
    independent = np.array(flat_rows).reshape(15, 3)
    independent_before = independent.copy()

    tree = CHAID.Tree(independent, dependent)
    best = tree.generate_best_split(tree.vectorised_array, tree.observed)

    # Both input arrays are checked with the same failure message.
    unchanged_msg = 'Calling chaid should have no side affects for original numpy arrays'
    assert list_ordered_equal(independent, independent_before), unchanged_msg
    assert list_ordered_equal(dependent, dependent_before), unchanged_msg

    assert best.column_id == 0, 'Identifies correct column to split on'
    expected_groups = [[1], [2], [3]]
    assert list_unordered_equal(best.split_map, expected_groups), \
        'Correctly identifies catagories'
    assert list_unordered_equal(best.surrogates, []), \
        'No surrogates should be generated'
    assert best.p < 0.015
def test_p_and_chi_values():
    """
    Compare the best split's chi and p against hand-calculated values.

    Seven observations with a single independent column are used; both
    statistics are rounded to four decimal places before comparison.
    """
    dependent = np.array([1] * 3 + [2] * 4)
    independent = np.array([1] * 4 + [2] * 3).reshape(7, 1)

    tree = CHAID.Tree(independent, dependent, split_threshold=0.9)
    best = tree.generate_best_split(tree.vectorised_array, tree.observed)

    rounded_chi = round(best.chi, 4)
    assert rounded_chi == 3.9375
    rounded_p = round(best.p, 4)
    assert rounded_p == 0.0472
def test_min_child_node_size_does_not_stop_for_unweighted_case(self):
    """
    Build an unweighted tree with min_child_node_size=10 and count its nodes.

    The tree is built from the fixture arrays ``self.ndarr`` / ``self.arr``
    and is expected to end up with exactly four entries in ``tree_store``.
    """
    tree_options = {
        'alpha_merge': 0.999,
        'max_depth': 5,
        'min_child_node_size': 10,
    }
    tree = CHAID.Tree(self.ndarr, self.arr, **tree_options)
    tree.build_tree()

    node_count = len(tree.tree_store)
    assert node_count == 4
def test_surrogate_default_min_p(self):
    """
    Check that the chosen split has a lower p-value than its first surrogate.

    The fixture data is also expected to give the chosen split the larger
    chi value, so it cannot be smaller than the surrogate on both measures.
    """
    tree = CHAID.Tree(self.ndarr, self.arr, split_threshold=0.9)
    best = tree.generate_best_split(
        tree.vectorised_array, tree.observed, None)

    first_surrogate = best.surrogates[0]
    assert best.p < first_surrogate.p, \
        'The best split should be the minimum p by default'
    assert best.chi > first_surrogate.chi, \
        'The data picked should not allow picked split to have both p and chi less than the surrogate'
def test_surrgate_detection(self):
    """
    Check that one surrogate split is reported for the fixture data.

    The best split is expected on column 1, with a single surrogate on
    column 0.
    """
    tree = CHAID.Tree(self.ndarr, self.arr, split_threshold=0.9)
    best = tree.generate_best_split(
        tree.vectorised_array, tree.observed, None)

    assert best.column_id == 1, 'The best split should be on column 1'

    surrogates = best.surrogates
    assert len(surrogates) == 1, \
        'There should be a single surrogate in given data'
    assert surrogates[0].column_id == 0, \
        'The surrogate should be on column 0'
def test_incorrect_weighted_counts(self):
    """
    Regression test for the weighted member counts stored on tree nodes.

    Per the original note, a node once used the class-level weights instead
    of the weights sliced for that node. The test builds a weighted tree
    from the fixture data and compares the ``members`` of nodes 3 and 5
    with fixed expected counts.
    """
    tree = CHAID.Tree(
        self.ndarr,
        self.arr,
        alpha_merge=0.999,
        weights=self.wt,
        max_depth=5,
        min_parent_node_size=2,
    )
    tree.build_tree()

    # (node index, expected weighted members), checked in the original order.
    expected_members = (
        (3, {1: 0, 2: 1.2}),
        (5, {1: 5.0, 2: 6.0}),
    )
    for node_index, members in expected_members:
        assert tree.tree_store[node_index].members == members
def test_new_columns_constructor():
    """
    Construct a tree from CHAID column objects rather than raw arrays.

    Two ordinal columns sharing one metadata mapping serve as predictors
    and a nominal column as the dependent variable. The test checks the
    groupings string of the split on the first stored node.
    """
    metadata = {0: '0-5', 1: '6-10', 2: '11-15'}
    predictors = (
        ("orientation",
         np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1])),
        ("age",
         np.array([0, 1, 1, 0, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
    )
    income = np.array([0, 0, 1, 1, 2, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

    cols = [
        CHAID.OrdinalColumn(values, name=label, metadata=metadata)
        for label, values in predictors
    ]
    dependent = CHAID.NominalColumn(income)
    tree = CHAID.Tree(cols, dependent, {'min_child_node_size': 1})

    root_split = tree.tree_store[0].split
    assert root_split.groupings == "[['0-5'], ['6-10', '11-15']]"
def test_correct_dof():
    """
    Check the degrees of freedom reported for the best split.

    The expected value is (distinct gender values - 1) multiplied by
    (distinct income values - 1).
    """
    gender = np.array(
        [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2])
    income = np.array(
        [0, 0, 1, 0, 2, 0, 1, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

    # Stack the single predictor as a row, then flip it into one column.
    independent = np.vstack([gender]).T

    tree = CHAID.Tree(independent, income, alpha_merge=0.9)
    best = tree.generate_best_split(tree.vectorised_array, tree.observed)

    gender_levels = len(set(gender))
    income_levels = len(set(income))
    assert best.dof == (gender_levels - 1) * (income_levels - 1)
def test_spliting_identical_values():
    """
    Check that no valid split is found when the dependent variable is constant.

    Every one of the ten observations has the dependent value 1. The test
    compares both input arrays against copies taken beforehand, then
    expects the returned split to report itself as invalid.
    """
    arr = np.array(([1] * 5) + ([1] * 5))
    orig_arr = arr.copy()
    ndarr = np.array(([1, 2, 3] * 5) + ([2, 2, 3] * 5)).reshape(10, 3)
    orig_ndarr = ndarr.copy()

    tree = CHAID.Tree(ndarr, arr)
    split = tree.generate_best_split(tree.vectorised_array, tree.observed)

    no_side_effects_msg = \
        'Calling chaid should have no side affects for original numpy arrays'
    assert list_ordered_equal(ndarr, orig_ndarr), no_side_effects_msg
    # This check used to carry the copy-pasted message 'Identifies correct
    # column to split on', which misreported a mutation of the dependent array.
    assert list_ordered_equal(arr, orig_arr), no_side_effects_msg
    assert not split.valid(), \
        'Should not be able to split data with no skew'
def test_p_and_chi_values_when_weighting_applied():
    """
    Check chi and p of the best split when observation weights are supplied.

    The weights are passed both to the tree constructor and to
    ``generate_best_split``; both statistics are rounded to four decimal
    places before comparison.
    """
    gender = np.array([0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1])
    income = np.array([0, 0, 1, 0, 2, 0, 1, 2, 1, 0, 1])
    weighting = np.array(
        [0.9, 0.8, 0.9, 1.1, 1.2, 0.8, 1.3, 0.2, 0.5, 0.7, 1.1])

    # Stack the single predictor as a row, then flip it into one column.
    independent = np.vstack([gender]).T

    tree = CHAID.Tree(independent, income, alpha_merge=0.9, weights=weighting)
    best = tree.generate_best_split(
        tree.vectorised_array, tree.observed, weighting)

    rounded_chi = round(best.chi, 4)
    assert rounded_chi == 1.6179
    rounded_p = round(best.p, 4)
    assert rounded_p == 0.4453
def test_zero_subbed_weighted_ndarry():
    """
    Check a weighted split whose cross-tabulation contains empty cells.

    Every observation with gender 2 has income 0, so some gender/income
    combinations never occur. Per the original note, a very small float is
    substituted for such zero counts in the weighted case; that
    substitution is not visible here, so confirm it against the CHAID
    implementation. The first half of the observations is weighted 0.9 and
    the second half 1.9.
    """
    gender = np.array(
        [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1])
    income = np.array(
        [0, 0, 1, 1, 2, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

    half = len(gender) // 2
    weighting = np.array([0.9] * half + [1.9] * half)

    # Stack the single predictor as a row, then flip it into one column.
    independent = np.vstack([gender]).T

    tree = CHAID.Tree(independent, income, alpha_merge=0.9, weights=weighting)
    best = tree.generate_best_split(
        tree.vectorised_array, tree.observed, weighting)

    rounded_chi = round(best.chi, 4)
    assert rounded_chi == 14.5103
    rounded_p = round(best.p, 4)
    assert rounded_p == 0.0007
def setUp(self):
    """
    Create the tree shared by the tree generation tests.

    Ten observations are used: the dependent variable is five 1s followed
    by five 2s, and only the first of the three independent columns differs
    between the two halves. The tree is stored on ``self.tree``.
    """
    dependent = np.array([1] * 5 + [2] * 5)
    flat_rows = [1, 2, 3] * 5 + [2, 2, 3] * 5
    independent = np.array(flat_rows).reshape(10, 3)
    self.tree = CHAID.Tree(independent, dependent)