Example #1
0
def test_best_split_with_combination():
    """
    Best split on a 15x3 array whose first column cleanly separates the data.

    The first column takes the values 1, 2 and 3 (five rows each) while the
    other two columns are constant. Verifies the chosen column, the split
    map, the absence of surrogates, the p-value bound, and that neither
    input array is modified by the call.
    """
    unchanged_msg = 'Calling chaid should have no side affects for original numpy arrays'

    dependent = np.array([1] * 5 + [2] * 10)
    dependent_before = dependent.copy()
    flat_rows = [1, 2, 3] * 5 + [2, 2, 3] * 5 + [3, 2, 3] * 5
    independent = np.array(flat_rows).reshape(15, 3)
    independent_before = independent.copy()

    tree = CHAID.Tree(independent, dependent)
    best = tree.generate_best_split(tree.vectorised_array, tree.observed)

    # Inputs must come out exactly as they went in.
    assert list_ordered_equal(independent, independent_before), unchanged_msg
    assert list_ordered_equal(dependent, dependent_before), unchanged_msg

    assert best.column_id == 0, 'Identifies correct column to split on'
    expected_map = [[1], [2], [3]]
    assert list_unordered_equal(
        best.split_map, expected_map), 'Correctly identifies catagories'
    assert list_unordered_equal(
        best.surrogates, []), 'No surrogates should be generated'
    assert best.p < 0.015
Example #2
0
def test_p_and_chi_values():
    """
    Compare the split's chi and p (rounded to 4 places) with fixed values.

    Uses a 7x1 independent array (four 1s, three 2s) against a dependent
    array of three 1s and four 2s.
    """
    dependent = np.array([1] * 3 + [2] * 4)
    independent = np.array([1] * 4 + [2] * 3).reshape(7, 1)

    tree = CHAID.Tree(independent, dependent, split_threshold=0.9)
    best = tree.generate_best_split(tree.vectorised_array, tree.observed)

    chi_rounded = round(best.chi, 4)
    assert chi_rounded == 3.9375
    p_rounded = round(best.p, 4)
    assert p_rounded == 0.0472
Example #3
0
 def test_min_child_node_size_does_not_stop_for_unweighted_case(self):
     """
     Build an unweighted tree with min_child_node_size=10 and check
     that the resulting tree store holds exactly four nodes.
     """
     options = dict(alpha_merge=0.999, max_depth=5, min_child_node_size=10)
     tree = CHAID.Tree(self.ndarr, self.arr, **options)
     tree.build_tree()

     node_count = len(tree.tree_store)
     assert node_count == 4
Example #4
0
    def test_surrogate_default_min_p(self):
        """
        By default the chosen split should have a smaller p than its first
        surrogate, while having a larger chi than that surrogate.
        """
        tree = CHAID.Tree(self.ndarr, self.arr, split_threshold=0.9)
        best = tree.generate_best_split(
            tree.vectorised_array, tree.observed, None)

        best_p = best.p
        surrogate_p = best.surrogates[0].p
        assert best_p < surrogate_p, 'The best split should be the minimum p by default'

        best_chi = best.chi
        surrogate_chi = best.surrogates[0].chi
        assert best_chi > surrogate_chi, 'The data picked should not allow picked split to have both p and chi less than the surrogate'
Example #5
0
    def test_surrgate_detection(self):
        """
        With the fixture data the best split should land on column 1 and
        report exactly one surrogate, which is on column 0.
        """
        tree = CHAID.Tree(self.ndarr, self.arr, split_threshold=0.9)
        best = tree.generate_best_split(
            tree.vectorised_array, tree.observed, None)

        assert best.column_id == 1, 'The best split should be on column 1'

        surrogate_count = len(best.surrogates)
        assert surrogate_count == 1, 'There should be a single surrogate in given data'

        surrogate_column = best.surrogates[0].column_id
        assert surrogate_column == 0, 'The surrogate should be on column 0'
Example #6
0
 def test_incorrect_weighted_counts(self):
     """
     Regression check on weighted member counts of individual nodes.

     Per the original note, a past bug used the tree-level weights rather
     than the weights sliced for the node; here nodes 3 and 5 of the built
     tree are checked against fixed weighted counts.
     """
     tree = CHAID.Tree(
         self.ndarr,
         self.arr,
         alpha_merge=0.999,
         weights=self.wt,
         max_depth=5,
         min_parent_node_size=2,
     )
     tree.build_tree()

     third_node_members = tree.tree_store[3].members
     assert third_node_members == {1: 0, 2: 1.2}
     fifth_node_members = tree.tree_store[5].members
     assert fifth_node_members == {1: 5.0, 2: 6.0}
Example #7
0
def test_new_columns_constructor():
    """
    Construct a tree from CHAID column objects instead of raw arrays.

    Two ordinal columns sharing one metadata mapping are passed as the
    independent variables and a nominal column as the dependent variable;
    the root node's split groupings are compared with a fixed string.
    """
    labels = {0: '0-5', 1: '6-10', 2: '11-15'}
    orientation_values = np.array(
        [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1])
    age_values = np.array(
        [0, 1, 1, 0, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    income_values = np.array(
        [0, 0, 1, 1, 2, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

    orientation_col = CHAID.OrdinalColumn(
        orientation_values, name="orientation", metadata=labels)
    age_col = CHAID.OrdinalColumn(age_values, name="age", metadata=labels)
    dependent_col = CHAID.NominalColumn(income_values)

    tree = CHAID.Tree(
        [orientation_col, age_col], dependent_col, {'min_child_node_size': 1})

    root_groupings = tree.tree_store[0].split.groupings
    assert root_groupings == "[['0-5'], ['6-10', '11-15']]"
Example #8
0
def test_correct_dof():
    """
    The split's degrees of freedom should equal
    (distinct gender values - 1) * (distinct income values - 1).
    """
    gender = np.array(
        [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2])
    income = np.array(
        [0, 0, 1, 0, 2, 0, 1, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

    # Single predictor laid out as one column.
    stacked = np.vstack([gender])
    predictors = np.transpose(stacked)

    tree = CHAID.Tree(predictors, income, alpha_merge=0.9)
    best = tree.generate_best_split(tree.vectorised_array, tree.observed)

    gender_levels = len(set(gender))
    income_levels = len(set(income))
    assert best.dof == (gender_levels - 1) * (income_levels - 1)
Example #9
0
def test_spliting_identical_values():
    """
    Test that data with a single-valued dependent variable cannot be split.

    The dependent array is ten 1s, so the generated split is expected to be
    reported as not valid. The test also checks that constructing the tree
    and requesting a split leaves both input arrays unchanged.
    """
    arr = np.array(([1] * 5) + ([1] * 5))
    orig_arr = arr.copy()
    ndarr = np.array(([1, 2, 3] * 5) + ([2, 2, 3] * 5)).reshape(10, 3)
    orig_ndarr = ndarr.copy()
    tree = CHAID.Tree(ndarr, arr)

    split = tree.generate_best_split(tree.vectorised_array, tree.observed)
    assert list_ordered_equal(ndarr, orig_ndarr), \
        'Calling chaid should have no side affects for original numpy arrays'
    # This assertion guards against mutation of the dependent array, so its
    # message describes the side-effect check rather than column selection.
    assert list_ordered_equal(arr, orig_arr), \
        'Calling chaid should have no side affects for original numpy arrays'
    assert not split.valid(), \
        'Should not be able to split data with no skew'
Example #10
0
def test_p_and_chi_values_when_weighting_applied():
    """
    Compare chi and p (rounded to 4 places) with fixed values when
    per-row weights are passed to both the tree and the split call.
    """
    gender = np.array([0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1])
    income = np.array([0, 0, 1, 0, 2, 0, 1, 2, 1, 0, 1])
    row_weights = np.array(
        [0.9, 0.8, 0.9, 1.1, 1.2, 0.8, 1.3, 0.2, 0.5, 0.7, 1.1])

    # Single predictor laid out as one column.
    predictors = np.transpose(np.vstack([gender]))

    tree = CHAID.Tree(predictors, income, alpha_merge=0.9, weights=row_weights)
    best = tree.generate_best_split(
        tree.vectorised_array, tree.observed, row_weights)

    chi_rounded = round(best.chi, 4)
    assert chi_rounded == 1.6179
    p_rounded = round(best.p, 4)
    assert p_rounded == 0.4453
Example #11
0
def test_zero_subbed_weighted_ndarry():
    """
    Weighted split where some predictor/response combinations never occur.

    The first half of the rows is weighted 0.9 and the second half 1.9.
    Per the original note, the zero cells are given a very small float
    value in the weighted case (that substitution happens inside CHAID and
    is not visible here). Chi and p, rounded to 4 places, are compared
    with fixed values.
    """
    gender = np.array(
        [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1])
    income = np.array(
        [0, 0, 1, 1, 2, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

    half = int(len(gender) / 2.0)
    row_weights = np.array([0.9] * half + [1.9] * half)

    # Single predictor laid out as one column.
    predictors = np.transpose(np.vstack([gender]))

    tree = CHAID.Tree(predictors, income, alpha_merge=0.9, weights=row_weights)
    best = tree.generate_best_split(
        tree.vectorised_array, tree.observed, row_weights)

    chi_rounded = round(best.chi, 4)
    assert chi_rounded == 14.5103
    p_rounded = round(best.p, 4)
    assert p_rounded == 0.0007
Example #12
0
 def setUp(self):
     """
     Create the shared tree fixture: a 10x3 independent array against a
     dependent array of five 1s followed by five 2s.
     """
     dependent = np.array([1] * 5 + [2] * 5)
     flat_rows = [1, 2, 3] * 5 + [2, 2, 3] * 5
     independent = np.array(flat_rows).reshape(10, 3)
     self.tree = CHAID.Tree(independent, dependent)