Example #1
0
class TestMasking(unittest.TestCase):
    """Exercise a ``Data`` object that is constructed with an observation mask."""

    def setUp(self):
        """Create five observations with ``True`` mask entries on the first two rows."""
        covariates = pd.DataFrame({
            "a": [1, 2, 3, 4, 5],
            "b": [1, 1, 1, 1, 1],
            "c": [1, 2, 3, 3, 4],
        })
        self.y = np.array([1, 2, 3, 4, 5])
        self.X = format_covariate_matrix(covariates)
        self.mask = np.array([True, True, False, False, False])
        self.data = Data(self.X, self.y, self.mask, normalize=False)

    def test_y_sum(self):
        """The summed response is expected to be 3 + 4 + 5."""
        total = self.data.y.summed_y()
        self.assertEqual(total, 12)

    def test_updating_y_sum(self):
        """Doubling the response is expected to double the summed value."""
        doubled_y = self.y * 2
        self.data.update_y(doubled_y)
        self.assertEqual(self.data.y.summed_y(), 24)

    def test_n_obsv(self):
        """Three observations are expected to be counted."""
        observation_count = self.data.X.n_obsv
        self.assertEqual(observation_count, 3)

    def test_updating_mask(self):
        """Adding a split condition should yield consistently updated masks and counts."""
        from operator import le
        from bartpy.splitcondition import SplitCondition

        condition = SplitCondition(0, 4, le)
        updated_data = self.data + condition

        # The same mask must be visible on the data, its covariates and its response.
        expected_mask = [True, True, False, False, True]
        self.assertListEqual(list(updated_data.mask), expected_mask)
        self.assertListEqual(list(updated_data.X.mask), expected_mask)
        self.assertListEqual(list(updated_data.y._mask), expected_mask)

        # Both the public and the private observation counts are checked.
        expected_count = 2
        self.assertEqual(updated_data.X.n_obsv, expected_count)
        self.assertEqual(updated_data.X._n_obsv, expected_count)
        self.assertEqual(updated_data.y.summed_y(), 7)
Example #2
0
 def setUp(self):
     """Prepare a normalized ``Data`` fixture over three covariate columns."""
     raw_frame = pd.DataFrame({
         "a": [1, 2, 3, 4, 5],
         "b": [1, 1, 1, 1, 1],
         "c": [1, 2, 3, 3, 4],
     })
     self.y = np.array([1, 2, 3, 4, 5])
     self.X = format_covariate_matrix(raw_frame)
     self.data = Data(self.X, self.y, normalize=True)
Example #3
0
 def __init__(self, data: Data, split_conditions: List[SplitCondition]=None, combined_condition=None):
     """
     Build the split state from a data set and the conditions applied to it.

     Parameters
     ----------
     data: Data
         Source data; its ``X`` and ``unique_columns`` are reused and its ``y`` is deep-copied
     split_conditions: List[SplitCondition]
         Conditions stored on ``_conditions``; ``None`` is replaced by a new empty list
     combined_condition:
         Stored unchanged on ``_combined_condition``
     """
     # A fresh list per call avoids sharing one mutable default between instances.
     if split_conditions is None:
         split_conditions = []
     # Private copy of the data: y is deep-copied, X is passed through as-is.
     self._data = Data(data.X, deepcopy(data.y), cache=False, unique_columns=data.unique_columns)
     self._conditions = split_conditions
     self._combined_condition = combined_condition
     # self.condition() is defined outside this view; its result is used to index X
     # (presumably a boolean/row selector -- TODO confirm).
     self._conditioned_X = self._data.X[self.condition()]
     # NOTE(review): condition() is evaluated a second time here to index the private _y.
     self._conditioned_data = Data(self._conditioned_X, self._data._y[self.condition()], unique_columns=data.unique_columns)
     # NOTE(review): initialised only after condition() has already run twice --
     # confirm condition() does not read _combined_conditioner.
     self._combined_conditioner = None
Example #4
0
 def setUp(self):
     """Assemble five nodes (decision nodes ``a`` and ``c``, leaves ``b``, ``d``, ``e``) into a tree."""
     covariates = pd.DataFrame({"a": [1, 2]}).values
     data = Data(covariates, np.array([1, 1]))
     self.data = data

     # Each node gets its own Split; children are created before the node built from them.
     leaf_d = LeafNode(Split(data))
     leaf_e = LeafNode(Split(data))
     node_c = DecisionNode(Split(data), leaf_d, leaf_e)
     leaf_b = LeafNode(Split(data))
     node_a = DecisionNode(Split(data), leaf_b, node_c)

     self.d, self.e, self.c = leaf_d, leaf_e, node_c
     self.b, self.a = leaf_b, node_a
     self.tree = Tree([node_a, leaf_b, node_c, leaf_d, leaf_e])
Example #5
0
class TestData(unittest.TestCase):
    """Checks on a normalized ``Data`` object built from a three-column frame."""

    def setUp(self):
        """Create the shared fixture: five observations, columns ``a``, ``b`` and ``c``."""
        columns = {
            "a": [1, 2, 3, 4, 5],
            "b": [1, 1, 1, 1, 1],
            "c": [1, 2, 3, 3, 4],
        }
        self.y = np.array([1, 2, 3, 4, 5])
        self.X = pd.DataFrame(columns)
        self.data = Data(self.X, self.y, normalize=True)

    def test_unnormalization(self):
        """Un-normalizing should recover the raw response and rescale new values."""
        self.assertListEqual(list(self.data.unnormalized_y), list(self.y))
        normalized_values = np.array([0, 0.25, 0.5, 0.75])
        restored = self.data.unnormalize_y(normalized_values)
        self.assertListEqual(list(restored), [3, 4, 5, 6])

    def test_unique_proportion_of_value_in_variable(self):
        """A value occurring once in five rows has proportion 0.2."""
        proportion = self.data.proportion_of_value_in_variable(0, 1)
        self.assertEqual(proportion, 0.2)

    def test_non_unique_proportion_of_value_in_variable(self):
        """Column 2 holds the value 1 once and the value 3 twice."""
        once = self.data.proportion_of_value_in_variable(2, 1)
        self.assertEqual(once, 0.2)
        twice = self.data.proportion_of_value_in_variable(2, 3)
        self.assertEqual(twice, 0.4)

    def test_unique_columns(self):
        """Only column 0 is expected to be reported as unique."""
        self.assertEqual(self.data.unique_columns, [0])

    def test_covariates_stored_as_matrix(self):
        """The covariates handed in as a DataFrame must be held as a plain ndarray."""
        self.assertIs(type(self.data.X), np.ndarray)

    def test_is_not_constant(self):
        """``is_not_constant`` distinguishes varying from constant arrays."""
        varying = np.array([1, 1, 2, 3])
        self.assertTrue(is_not_constant(varying))
        constant = np.array([1, 1, 1, 1])
        self.assertFalse(is_not_constant(constant))

    def test_n_obsv(self):
        """All five rows are counted."""
        self.assertEqual(self.data.n_obsv, 5)

    def test_normalization(self):
        """The normalized response is expected to span -0.5 to 0.5."""
        normalized_y = self.data.y
        self.assertEqual(-0.5, normalized_y.min())
        self.assertEqual(0.5, normalized_y.max())

    def test_splittable_variables(self):
        """Columns 0 and 2 are expected to be splittable."""
        splittable = list(self.data.splittable_variables())
        self.assertListEqual(splittable, [0, 2])

    def test_random_splittable_value(self):
        """Sampled split values for column 0 stay in 1..4; column 1 yields ``None``."""
        allowed_values = [1, 2, 3, 4]
        for _ in range(10000):
            self.assertIn(self.data.random_splittable_value(0), allowed_values)
        self.assertIsNone(self.data.random_splittable_value(1))

    def test_random_splittable_variable(self):
        """Sampling returns column 0 or 2, and raises when only column 1 is present."""
        allowed_variables = [0, 2]
        for _ in range(100):
            self.assertIn(self.data.random_splittable_variable(), allowed_variables)

        # Keep only column 1 so that sampling a splittable variable must fail.
        only_column_one = self.data.X[:, [1]]
        self.filtered_data = Data(only_column_one, self.data.y)
        with self.assertRaises(NoSplittableVariableException):
            self.filtered_data.random_splittable_variable()

    def test_n_splittable_variables(self):
        """Two splittable variables are expected."""
        self.assertEqual(self.data.n_splittable_variables, 2)

    def test_variables(self):
        """Variables are addressed by their column positions."""
        self.assertEqual(self.data.variables, [0, 1, 2])
Example #6
0
 def test_single_condition_data(self):
     """A ``<= 1`` condition should keep the row with value 1 and ``> 1`` the row with value 2."""
     covariates = pd.DataFrame({"a": [1, 2]}).values
     data = Data(covariates, np.array([1, 2]))

     left_condition = SplitCondition(0, 1, le)
     right_condition = SplitCondition(0, 1, gt)

     # Each condition is applied to its own fresh Split of the same data.
     left_split = Split(data) + left_condition
     right_split = Split(data) + right_condition

     left_values = list(left_split.data.X[:, 0])
     self.assertListEqual([1], left_values)
     right_values = list(right_split.data.X[:, 0])
     self.assertListEqual([2], right_values)
Example #7
0
    def _convert_covariates_to_data(self, X: Union[np.ndarray, pd.DataFrame], y: np.ndarray) -> Data:
        """
        Wrap covariates and response in a normalized ``Data`` object and record column names.

        Parameters
        ----------
        X: Union[np.ndarray, pd.DataFrame]
            Covariates; a DataFrame (or any subclass of it) is reduced to its ``.values``
        y: np.ndarray
            Response values

        Returns
        -------
        Data
            Built from deep copies of ``X`` and ``y`` with ``normalize=True``

        Notes
        -----
        Sets ``self.columns`` as a side effect: the frame's own ``columns`` for DataFrame
        input, otherwise the positional names ``"0"``, ``"1"``, ... as a list of strings.
        """
        from copy import deepcopy

        # isinstance rather than an exact type comparison, so DataFrame subclasses
        # are also unwrapped instead of falling through to the positional-name branch.
        if isinstance(X, pd.DataFrame):
            self.columns = X.columns
            X = X.values
        else:
            self.columns = [str(position) for position in range(X.shape[1])]

        # Deep copies keep the caller's arrays independent of the returned Data.
        return Data(deepcopy(X), deepcopy(y), normalize=True)
Example #8
0
 def setUp(self):
     """Assemble five nodes over a single-observation data set and collect them in a tree."""
     covariates = format_covariate_matrix(pd.DataFrame({"a": [1]}))
     response = np.array([1]).astype(float)
     data = Data(covariates, response)
     self.data = data

     # Leaves d and e receive an explicit None second argument; leaf b relies on the default.
     leaf_d = LeafNode(Split(data), None)
     leaf_e = LeafNode(Split(data), None)
     node_c = DecisionNode(Split(data), leaf_d, leaf_e)
     leaf_b = LeafNode(Split(data))
     node_a = DecisionNode(Split(data), leaf_b, node_c)

     self.d, self.e, self.c = leaf_d, leaf_e, node_c
     self.b, self.a = leaf_b, node_a
     self.tree = Tree([node_a, leaf_b, node_c, leaf_d, leaf_e])
Example #9
0
 def _convert_covariates_to_data(X: np.ndarray, y: np.ndarray) -> Data:
     """
     Wrap covariates and response in a normalized ``Data`` object with an all-False mask.

     Parameters
     ----------
     X: np.ndarray
         Covariates; a pandas DataFrame (or subclass) is also accepted and reduced to ``.values``
     y: np.ndarray
         Response values

     Returns
     -------
     Data
         Built from deep copies of ``X`` and ``y``, ``normalize=True`` and a boolean
         mask of zeros with the same shape as ``X``
     """
     from copy import deepcopy

     # isinstance rather than an exact type comparison, so DataFrame subclasses
     # are unwrapped as well.
     if isinstance(X, pd.DataFrame):
         X = X.values
     return Data(deepcopy(X),
                 deepcopy(y),
                 mask=np.zeros_like(X).astype(bool),
                 normalize=True)
Example #10
0
 def setUp(self):
     """Create a two-tree model over five observations and initialize its trees."""
     frame = pd.DataFrame({"a": [1, 2, 3, 4, 5]})
     self.X = format_covariate_matrix(frame)
     self.raw_y = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

     # self.X is run through format_covariate_matrix once more when building Data.
     formatted_X = format_covariate_matrix(self.X)
     self.data = Data(formatted_X, self.raw_y, normalize=True)

     # Sigma is scaled by the normalizing scale of the response.
     normalizing_scale = self.data.y.normalizing_scale
     sigma = Sigma(0.001, 0.001, scaling_factor=normalizing_scale)
     self.model = Model(self.data, sigma, n_trees=2, initializer=None)
     self.model.initialize_trees()
Example #11
0
    def test_combined_condition_data(self):
        """Stacking ``<= 3`` and ``> 1`` on column 0 should leave the values 2 and 3."""
        covariates = pd.DataFrame({"a": [1, 2, 3, 4]}).values
        data = Data(covariates, np.array([1, 2, 1, 1]))

        # Both halves of each split are created; only one half of each is applied.
        first_left_condition = SplitCondition(0, 3, le)
        first_right_condition = SplitCondition(0, 3, gt)
        second_left_condition = SplitCondition(0, 1, le)
        second_right_condition = SplitCondition(0, 1, gt)

        split = Split(data)
        updated_split = split + first_left_condition + second_right_condition
        conditioned_data = updated_split.data

        remaining_values = list(conditioned_data.X[:, 0])
        self.assertListEqual([2, 3], remaining_values)
Example #12
0
    def test_most_recent_split(self):
        """The most recent split condition should be the last condition that was added."""
        covariates = pd.DataFrame({"a": [1, 2, 3, 4]}).values
        data = Data(covariates, np.array([1, 2, 1, 1]))

        # Both halves of each split are created; only one half of each is applied.
        first_left_condition = SplitCondition(0, 3, le)
        first_right_condition = SplitCondition(0, 3, gt)
        second_left_condition = SplitCondition(0, 1, le)
        second_right_condition = SplitCondition(0, 1, gt)

        split = Split(data)
        updated_split = split + first_left_condition + second_right_condition

        after_first_condition = split + first_left_condition
        self.assertEqual(after_first_condition.most_recent_split_condition(),
                         first_left_condition)
        self.assertEqual(updated_split.most_recent_split_condition(),
                         second_right_condition)
Example #13
0
class TestDataNormalization(unittest.TestCase):
    """Checks on normalizing and un-normalizing the response of a ``Data`` object."""

    def setUp(self):
        """Create a normalized fixture with five observations and three covariates."""
        raw_frame = pd.DataFrame({
            "a": [1, 2, 3, 4, 5],
            "b": [1, 1, 1, 1, 1],
            "c": [1, 2, 3, 3, 4],
        })
        self.y = np.array([1, 2, 3, 4, 5])
        self.X = format_covariate_matrix(raw_frame)
        self.data = Data(self.X, self.y, normalize=True)

    def test_unnormalization(self):
        """Un-normalizing should recover the raw response and rescale new values."""
        self.assertListEqual(list(self.data.unnormalized_y), list(self.y))

        normalized_values = np.array([0, 0.25, 0.5, 0.75])
        restored = self.data.unnormalize_y(normalized_values)
        self.assertListEqual(list(restored), [3, 4, 5, 6])

    def test_normalization(self):
        """The normalized response is expected to span -0.5 to 0.5."""
        normalized_y = self.data.y
        self.assertEqual(-0.5, normalized_y.min())
        self.assertEqual(0.5, normalized_y.max())
Example #14
0
    def setUp(self):
        """Split a leaf into ``a`` with children ``b``/``x``, then grow ``x`` into ``c`` with children ``d``/``e``."""
        covariates = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}).values
        self.data = Data(covariates, np.array([1, 2, 3]))

        # First split: column 0 at value 1.
        starting_leaf = LeafNode(Split(self.data))
        first_conditions = (SplitCondition(0, 1, le), SplitCondition(0, 1, gt))
        self.a = split_node(starting_leaf, first_conditions)
        self.b = self.a.left_child
        self.x = self.a.right_child
        self.tree = Tree([self.a, self.b, self.x])

        # Second split: column 1 at value 2, applied to the right child of a.
        right_child = self.a._right_child
        second_conditions = (SplitCondition(1, 2, le), SplitCondition(1, 2, gt))
        self.c = split_node(right_child, second_conditions)
        grow = TreeMutation("grow", self.x, self.c)
        mutate(self.tree, grow)

        self.d = self.c.left_child
        self.e = self.c.right_child
Example #15
0
    def setUp(self):
        """Split a leaf into ``a`` with children ``b``/``x``, then grow ``x`` into ``c`` with children ``d``/``e``."""
        frame = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
        X = format_covariate_matrix(frame)
        response = np.array([1, 2, 3]).astype(float)
        self.data = Data(X, response)

        # First split: column 0 at value 1.
        starting_leaf = LeafNode(Split(self.data))
        first_conditions = (SplitCondition(0, 1, le), SplitCondition(0, 1, gt))
        self.a = split_node(starting_leaf, first_conditions)
        self.b = self.a.left_child
        self.x = self.a.right_child
        self.tree = Tree([self.a, self.b, self.x])

        # Second split: column 1 at value 2, applied to the right child of a.
        right_child = self.a._right_child
        second_conditions = (SplitCondition(1, 2, le), SplitCondition(1, 2, gt))
        self.c = split_node(right_child, second_conditions)
        grow = TreeMutation("grow", self.x, self.c)
        mutate(self.tree, grow)

        self.d = self.c.left_child
        self.e = self.c.right_child
Example #16
0
class TestDataCaching(unittest.TestCase):
    """Checks that values read from ``Data`` follow an update of the response."""

    def setUp(self):
        """Create an un-normalized fixture with five observations and three covariates."""
        raw_frame = pd.DataFrame({
            "a": [1, 2, 3, 4, 5],
            "b": [1, 1, 1, 1, 1],
            "c": [1, 2, 3, 3, 4],
        })
        self.y = np.array([1, 2, 3, 4, 5])
        self.X = format_covariate_matrix(raw_frame)
        self.data = Data(self.X, self.y, normalize=False)

    def test_summed_y(self):
        """``summed_y`` must match the response both before and after doubling it."""
        self.assertEqual(self.data.summed_y(), np.sum(self.y))

        doubled_y = np.array(self.y * 2)
        self.data.update_y(doubled_y)
        self.assertEqual(self.data.summed_y(), np.sum(self.y) * 2)

    def test_y(self):
        """``y.data`` must match the response both before and after an update."""
        self.assertListEqual(list(self.data.y.data), list(self.y))

        updated_y = np.array(self.y * 2)
        self.data.update_y(updated_y)
        self.assertListEqual(list(self.data.y.data), list(updated_y))
Example #17
0
 def out_of_sample_condition(self, X: np.ndarray):
     """
     Evaluate this object's out-of-sample conditioner on ``X``.

     Parameters
     ----------
     X: np.ndarray
         Covariates handed unchanged to the conditioner's ``condition`` method

     Returns
     -------
     Whatever ``self.out_of_sample_conditioner().condition(X)`` returns
     """
     # No intermediate Data object is built: the conditioner is applied to X directly.
     return self.out_of_sample_conditioner().condition(X)
Example #18
0
 def test_random_splittable_variable(self):
     """Sampling returns column 0 or 2, and raises when only column 1 is present."""
     allowed_variables = [0, 2]
     for _ in range(100):
         self.assertIn(self.data.random_splittable_variable(), allowed_variables)

     # Keep only column 1 so that sampling a splittable variable must fail.
     only_column_one = self.data.X[:, [1]]
     self.filtered_data = Data(only_column_one, self.data.y)
     with self.assertRaises(NoSplittableVariableException):
         self.filtered_data.random_splittable_variable()
Example #19
0
 def setUp(self):
     """Create a leaf node over an unconditioned split of five observations."""
     frame = pd.DataFrame({"a": [1, 2, 3, 4, 5]})
     self.X = format_covariate_matrix(frame)

     # self.X is run through format_covariate_matrix once more when building Data.
     response = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
     self.data = Data(format_covariate_matrix(self.X), response)

     self.split = Split(self.data)
     self.node = LeafNode(self.split)
Example #20
0
 def setUp(self):
     """Create a single-observation ``Data`` fixture."""
     frame = pd.DataFrame({"a": [1]})
     self.X = format_covariate_matrix(frame)

     # self.X is run through format_covariate_matrix once more when building Data.
     formatted_X = format_covariate_matrix(self.X)
     self.data = Data(formatted_X, np.array([1.0]))
Example #21
0
 def setUp(self):
     """Create a single-observation ``Data`` fixture from raw frame values."""
     covariates = pd.DataFrame({"a": [1]}).values
     response = np.array([1])
     self.data = Data(covariates, response)
Example #22
0
 def test_null_split_returns_all_values(self):
     """A split without conditions should expose the same column values as the source data."""
     covariates = pd.DataFrame({"a": [1, 2]}).values
     data = Data(covariates, np.array([1, 2]))

     conditioned_data = Split(data).data

     source_values = list(data.X[:, 0])
     conditioned_values = list(conditioned_data.X[:, 0])
     self.assertListEqual(source_values, conditioned_values)