def make_splits(self, stamp_token, next_stamp_token, class_id):
  """Create the best split using the accumulated stats and flush the state.

  Args:
    stamp_token: Stamp token forwarded to the stats accumulator's `flush`.
    next_stamp_token: Next stamp token forwarded to the same `flush` call.
    class_id: Class id forwarded to the split-building op.

  Returns:
    A tuple `(are_splits_ready, partition_ids, gains, split_infos)`, where
    `are_splits_ready` is a constant `True` and the other three entries are
    the outputs of `build_categorical_equality_splits`.
  """
  # Function-scope import so this method does not rely on a module-level
  # `dtypes` name being present.
  from tensorflow.python.framework import dtypes  # pylint: disable=g-import-not-at-top

  # Get the aggregated gradients and hessians per <partition_id, feature_id>
  # pair.
  num_minibatches, partition_ids, feature_ids, gradients, hessians = (
      self._stats_accumulator.flush(stamp_token, next_stamp_token))

  # For sum_reduction, we don't need to divide by number of minibatches.
  # `math_ops.cast` is used instead of the deprecated `math_ops.to_int64`.
  num_minibatches = control_flow_ops.cond(
      ops.convert_to_tensor(self._loss_uses_sum_reduction),
      lambda: math_ops.cast(1, dtypes.int64),
      lambda: num_minibatches)

  partition_ids, gains, split_infos = (
      split_handler_ops.build_categorical_equality_splits(
          num_minibatches=num_minibatches,
          partition_ids=partition_ids,
          feature_ids=feature_ids,
          gradients=gradients,
          hessians=hessians,
          class_id=class_id,
          feature_column_group_id=self._feature_column_group_id,
          l1_regularization=self._l1_regularization,
          l2_regularization=self._l2_regularization,
          tree_complexity_regularization=(
              self._tree_complexity_regularization),
          min_node_weight=self._min_node_weight,
          bias_feature_id=_BIAS_FEATURE_ID,
          multiclass_strategy=self._multiclass_strategy,
          weak_learner_type=self._weak_learner_type))

  # There are no warm-up rounds needed in the equality column handler. So we
  # always return ready.
  are_splits_ready = constant_op.constant(True)
  return (are_splits_ready, partition_ids, gains, split_infos)
def testMakeCategoricalEqualitySplitEmptyInput(self):
  """Checks that empty inputs produce no partitions, gains or splits."""
  # `cached_session()` replaces the deprecated `test_session()`.
  with self.cached_session() as sess:
    gradients = []
    hessians = []
    partition_ids = []
    feature_ids = [[]]
    partitions, gains, splits = (
        split_handler_ops.build_categorical_equality_splits(
            num_minibatches=0,
            partition_ids=partition_ids,
            feature_ids=feature_ids,
            gradients=gradients,
            hessians=hessians,
            l1_regularization=0.1,
            l2_regularization=1,
            tree_complexity_regularization=0,
            min_node_weight=0,
            feature_column_group_id=0,
            bias_feature_id=-1,
            class_id=-1,
            multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
    partitions, gains, splits = sess.run([partitions, gains, splits])

  # All three outputs must be empty when there is nothing to split on.
  self.assertEqual(0, len(partitions))
  self.assertEqual(0, len(gains))
  self.assertEqual(0, len(splits))
def make_splits(self, stamp_token, next_stamp_token, class_id):
  """Flushes the accumulated stats and builds the best equality splits.

  Args:
    stamp_token: Stamp token handed to the stats accumulator's `flush`.
    next_stamp_token: Next stamp token handed to the same `flush` call.
    class_id: Class id passed through to the split-building op.

  Returns:
    `(are_splits_ready, partition_ids, gains, split_infos)`; the first entry
    is a constant `True`, the rest come from
    `build_categorical_equality_splits`.
  """
  # Flushing yields the gradients and hessians aggregated per
  # <partition_id, feature_id> pair, together with `num_minibatches`.
  (flushed_minibatches, flushed_partition_ids, flushed_feature_ids,
   flushed_gradients, flushed_hessians) = self._stats_accumulator.flush(
       stamp_token, next_stamp_token)

  # Under sum reduction no division by the number of minibatches is needed,
  # so a constant 1 is used in its place.
  uses_sum_reduction = ops.convert_to_tensor(self._loss_uses_sum_reduction)
  effective_minibatches = control_flow_ops.cond(
      uses_sum_reduction,
      lambda: math_ops.cast(1, dtypes.int64),
      lambda: flushed_minibatches)

  split_outputs = split_handler_ops.build_categorical_equality_splits(
      num_minibatches=effective_minibatches,
      partition_ids=flushed_partition_ids,
      feature_ids=flushed_feature_ids,
      gradients=flushed_gradients,
      hessians=flushed_hessians,
      class_id=class_id,
      feature_column_group_id=self._feature_column_group_id,
      l1_regularization=self._l1_regularization,
      l2_regularization=self._l2_regularization,
      tree_complexity_regularization=self._tree_complexity_regularization,
      min_node_weight=self._min_node_weight,
      bias_feature_id=_BIAS_FEATURE_ID,
      multiclass_strategy=self._multiclass_strategy,
      weak_learner_type=self._weak_learner_type)
  partition_ids, gains, split_infos = split_outputs

  # The equality column handler has no warm-up rounds, so the splits are
  # always reported as ready.
  return (constant_op.constant(True), partition_ids, gains, split_infos)
def make_splits(self, stamp_token, next_stamp_token, class_id):
  """Flushes the accumulated stats and builds the best equality splits.

  Args:
    stamp_token: Stamp token handed to the stats accumulator's `flush`.
    next_stamp_token: Next stamp token handed to the same `flush` call.
    class_id: Class id passed through to the split-building op.

  Returns:
    `(are_splits_ready, partition_ids, gains, split_infos)`; the first entry
    is a constant `True`, the rest come from
    `build_categorical_equality_splits`.
  """
  # The flush returns the gradients and hessians aggregated per
  # <partition_id, feature_id> pair.
  flushed = self._stats_accumulator.flush(stamp_token, next_stamp_token)
  (minibatch_count, flushed_partition_ids, flushed_feature_ids,
   flushed_gradients, flushed_hessians) = flushed

  split_outputs = split_handler_ops.build_categorical_equality_splits(
      num_minibatches=minibatch_count,
      partition_ids=flushed_partition_ids,
      feature_ids=flushed_feature_ids,
      gradients=flushed_gradients,
      hessians=flushed_hessians,
      class_id=class_id,
      feature_column_group_id=self._feature_column_group_id,
      l1_regularization=self._l1_regularization,
      l2_regularization=self._l2_regularization,
      tree_complexity_regularization=self._tree_complexity_regularization,
      min_node_weight=self._min_node_weight,
      bias_feature_id=_BIAS_FEATURE_ID,
      multiclass_strategy=self._multiclass_strategy)
  partition_ids, gains, split_infos = split_outputs

  # No warm-up rounds are needed in the equality column handler, so the
  # splits are always reported as ready.
  return (constant_op.constant(True), partition_ids, gains, split_infos)
def testMakeCategoricalEqualitySplitEmptyInput(self):
  """Verifies the split op returns empty outputs for empty inputs."""
  # Use `cached_session()`; `test_session()` is deprecated.
  with self.cached_session() as sess:
    partitions, gains, splits = (
        split_handler_ops.build_categorical_equality_splits(
            num_minibatches=0,
            partition_ids=[],
            feature_ids=[[]],
            gradients=[],
            hessians=[],
            l1_regularization=0.1,
            l2_regularization=1,
            tree_complexity_regularization=0,
            min_node_weight=0,
            feature_column_group_id=0,
            bias_feature_id=-1,
            class_id=-1,
            multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
    partitions, gains, splits = sess.run([partitions, gains, splits])

  # With no accumulated stats there is nothing to split.
  self.assertEqual(0, len(partitions))
  self.assertEqual(0, len(gains))
  self.assertEqual(0, len(splits))
def make_splits(self, stamp_token, next_stamp_token, class_id):
  """Builds the best equality splits from the flushed accumulator stats.

  Args:
    stamp_token: Stamp token given to the stats accumulator's `flush`.
    next_stamp_token: Next stamp token given to the same `flush` call.
    class_id: Class id forwarded to the split-building op.

  Returns:
    A 4-tuple `(are_splits_ready, partition_ids, gains, split_infos)`.
    `are_splits_ready` is a constant `True`; the remaining entries are the
    outputs of `build_categorical_equality_splits`.
  """
  # Aggregated gradients and hessians per <partition_id, feature_id> pair.
  (minibatches, stat_partition_ids, stat_feature_ids, stat_gradients,
   stat_hessians) = self._stats_accumulator.flush(stamp_token,
                                                  next_stamp_token)

  # Collect the op arguments first so the call itself stays short.
  split_op_args = dict(
      num_minibatches=minibatches,
      partition_ids=stat_partition_ids,
      feature_ids=stat_feature_ids,
      gradients=stat_gradients,
      hessians=stat_hessians,
      class_id=class_id,
      feature_column_group_id=self._feature_column_group_id,
      l1_regularization=self._l1_regularization,
      l2_regularization=self._l2_regularization,
      tree_complexity_regularization=self._tree_complexity_regularization,
      min_node_weight=self._min_node_weight,
      bias_feature_id=_BIAS_FEATURE_ID,
      multiclass_strategy=self._multiclass_strategy)
  partition_ids, gains, split_infos = (
      split_handler_ops.build_categorical_equality_splits(**split_op_args))

  # The equality column handler requires no warm-up rounds, hence the
  # splits are always ready.
  splits_ready = constant_op.constant(True)
  return (splits_ready, partition_ids, gains, split_infos)
def testMakeMulticlassCategoricalEqualitySplit(self):
  """Tests split handler op for categorical equality split in multiclass."""
  with self.cached_session() as sess:
    # One two-element gradient row and one 2x2 hessian per entry of
    # `partition_ids` / `feature_ids` below.
    gradient_rows = array_ops.constant(
        [[1.8, 3.5], [2.4, 1.0], [0.4, 4.0], [9.0, 3.1], [3.0, 0.8]])
    hessian_rows = array_ops.constant([
        [[0.78, 1], [12, 1]],
        [[0.4, 1], [1, 1]],
        [[0.24, 1], [1, 1]],
        [[0.16, 2], [-1, 1]],
        [[0.6, 1], [2, 1]],
    ])
    feature_id_rows = array_ops.constant(
        [[-1, 0], [1, 0], [2, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)

    op_outputs = split_handler_ops.build_categorical_equality_splits(
        num_minibatches=2,
        partition_ids=[0, 0, 0, 1, 1],
        feature_ids=feature_id_rows,
        gradients=gradient_rows,
        hessians=hessian_rows,
        l1_regularization=0.1,
        l2_regularization=1,
        tree_complexity_regularization=0,
        min_node_weight=0,
        feature_column_group_id=0,
        bias_feature_id=-1,
        class_id=-1,
        multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN,
        weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
    partitions_t, gains_t, splits_t = op_outputs
    partitions, _, splits = sess.run([partitions_t, gains_t, splits_t])

  self.assertAllEqual([0, 1], partitions)

  # Inspect the serialized split produced for the second partition.
  parsed_split = split_info_pb2.SplitInfo()
  parsed_split.ParseFromString(splits[1])
  left_leaf = parsed_split.left_child.vector
  right_leaf = parsed_split.right_child.vector
  equality_split = parsed_split.split_node.categorical_id_binary_split

  # Each leaf has 2 element vector.
  self.assertEqual(2, len(left_leaf.value))
  self.assertEqual(2, len(right_leaf.value))

  self.assertEqual(0, equality_split.feature_column)
  self.assertEqual(1, equality_split.feature_id)
def testMakeMulticlassCategoricalEqualitySplit(self):
  """Tests split handler op for categorical equality split in multiclass."""
  # `cached_session()` replaces the deprecated `test_session()`.
  with self.cached_session() as sess:
    # One two-element gradient row and one 2x2 hessian per entry of
    # `partition_ids` / `feature_ids`.
    gradients = array_ops.constant(
        [[1.8, 3.5], [2.4, 1.0], [0.4, 4.0], [9.0, 3.1], [3.0, 0.8]])
    hessian_0 = [[0.78, 1], [12, 1]]
    hessian_1 = [[0.4, 1], [1, 1]]
    hessian_2 = [[0.24, 1], [1, 1]]
    hessian_3 = [[0.16, 2], [-1, 1]]
    hessian_4 = [[0.6, 1], [2, 1]]
    hessians = array_ops.constant(
        [hessian_0, hessian_1, hessian_2, hessian_3, hessian_4])
    partition_ids = [0, 0, 0, 1, 1]
    feature_ids = array_ops.constant(
        [[-1, 0], [1, 0], [2, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
    partitions, gains, splits = (
        split_handler_ops.build_categorical_equality_splits(
            num_minibatches=2,
            partition_ids=partition_ids,
            feature_ids=feature_ids,
            gradients=gradients,
            hessians=hessians,
            l1_regularization=0.1,
            l2_regularization=1,
            tree_complexity_regularization=0,
            min_node_weight=0,
            feature_column_group_id=0,
            bias_feature_id=-1,
            class_id=-1,
            multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN))
    partitions, gains, splits = sess.run([partitions, gains, splits])
  self.assertAllEqual([0, 1], partitions)

  # Inspect the serialized split produced for the second partition.
  split_info = split_info_pb2.SplitInfo()
  split_info.ParseFromString(splits[1])
  left_child = split_info.left_child.vector
  right_child = split_info.right_child.vector
  split_node = split_info.split_node.categorical_id_binary_split

  # Each leaf has 2 element vector.
  self.assertEqual(2, len(left_child.value))
  self.assertEqual(2, len(right_child.value))

  self.assertEqual(0, split_node.feature_column)
  self.assertEqual(1, split_node.feature_id)
def testMakeCategoricalEqualitySplit(self):
  """Tests split handler op for categorical equality split."""
  # `cached_session()` replaces the deprecated `test_session()`.
  with self.cached_session() as sess:
    # The data looks like the following after dividing by number of steps (2).
    # (Gradient, Hessian) | Partition | Feature ID |
    # (0.9, 0.39)         | 0         | -1         |
    # (0.2, 0.12)         | 0         | 1          |
    # (1.4, 0.32)         | 0         | 2          |
    # (4.0, 0.13)         | 1         | -1         |
    # (4.0, 0.13)         | 1         | 1          |
    gradients = [1.8, 0.4, 2.8, 8.0, 8.0]
    hessians = [0.78, 0.24, 0.64, 0.26, 0.26]
    partition_ids = [0, 0, 0, 1, 1]
    feature_ids = array_ops.constant(
        [[-1, 0], [1, 0], [2, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
    partitions, gains, splits = (
        split_handler_ops.build_categorical_equality_splits(
            num_minibatches=2,
            partition_ids=partition_ids,
            feature_ids=feature_ids,
            gradients=gradients,
            hessians=hessians,
            l1_regularization=0.1,
            l2_regularization=1,
            tree_complexity_regularization=0,
            min_node_weight=0,
            feature_column_group_id=0,
            bias_feature_id=-1,
            class_id=-1,
            multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
    partitions, gains, splits = sess.run([partitions, gains, splits])
  self.assertAllEqual([0, 1], partitions)

  # Check the split on partition 0.
  # -(0.2 + 1.2 - 0.1) / (0.12 + 0.2 + 1)
  expected_left_weight = -0.9848484848484846

  # (0.2 + 1.2 - 0.1) ** 2 / (0.12 + 0.2 + 1)
  expected_left_gain = 1.2803030303030298

  # -(-0.5 + 0.1) / (0.07 + 1)
  expected_right_weight = 0.37383177570093457

  # (-0.5 + 0.1) ** 2 / (0.07 + 1)
  expected_right_gain = 0.14953271028037385

  # (0.2 + -0.5 + 1.2 - 0.1) ** 2 / (0.12 + 0.07 + 0.2 + 1)
  expected_bias_gain = 0.46043165467625885

  split_info = split_info_pb2.SplitInfo()
  split_info.ParseFromString(splits[0])
  left_child = split_info.left_child.vector
  right_child = split_info.right_child.vector
  split_node = split_info.split_node.categorical_id_binary_split

  self.assertEqual(0, split_node.feature_column)

  self.assertEqual(2, split_node.feature_id)

  self.assertAllClose(
      expected_left_gain + expected_right_gain - expected_bias_gain, gains[0],
      0.00001)

  self.assertAllClose([expected_left_weight], left_child.value, 0.00001)

  self.assertAllClose([expected_right_weight], right_child.value, 0.00001)

  # Check the split on partition 1.
  # (-4 + 0.1) / (0.13 + 1)
  expected_left_weight = -3.4513274336283186
  # (-4 + 0.1) ** 2 / (0.13 + 1)
  expected_left_gain = 13.460176991150442
  expected_right_weight = 0
  expected_right_gain = 0
  # (-4 + 0.1) ** 2 / (0.13 + 1)
  expected_bias_gain = 13.460176991150442

  # Verify candidate for partition 1, there's only one active feature here
  # so zero gain is expected.
  split_info = split_info_pb2.SplitInfo()
  split_info.ParseFromString(splits[1])
  left_child = split_info.left_child.vector
  right_child = split_info.right_child.vector
  split_node = split_info.split_node.categorical_id_binary_split
  # The expected gains were previously assigned but never asserted. The
  # expression below evaluates to exactly 0.0, the value that used to be
  # hard-coded here.
  self.assertAllClose(
      expected_left_gain + expected_right_gain - expected_bias_gain, gains[1],
      0.00001)

  self.assertAllClose([expected_left_weight], left_child.value, 0.00001)

  self.assertAllClose([expected_right_weight], right_child.value, 0.00001)

  self.assertEqual(0, split_node.feature_column)

  self.assertEqual(1, split_node.feature_id)
def testMakeCategoricalEqualitySplit(self):
  """Tests split handler op for categorical equality split."""
  # Use `cached_session()`; `test_session()` is deprecated.
  with self.cached_session() as sess:
    # The data looks like the following after dividing by number of steps (2).
    # (Gradient, Hessian) | Partition | Feature ID |
    # (0.9, 0.39)         | 0         | -1         |
    # (0.2, 0.12)         | 0         | 1          |
    # (1.4, 0.32)         | 0         | 2          |
    # (4.0, 0.13)         | 1         | -1         |
    # (4.0, 0.13)         | 1         | 1          |
    gradients = [1.8, 0.4, 2.8, 8.0, 8.0]
    hessians = [0.78, 0.24, 0.64, 0.26, 0.26]
    partition_ids = [0, 0, 0, 1, 1]
    feature_ids = array_ops.constant(
        [[-1, 0], [1, 0], [2, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
    partitions, gains, splits = (
        split_handler_ops.build_categorical_equality_splits(
            num_minibatches=2,
            partition_ids=partition_ids,
            feature_ids=feature_ids,
            gradients=gradients,
            hessians=hessians,
            l1_regularization=0.1,
            l2_regularization=1,
            tree_complexity_regularization=0,
            min_node_weight=0,
            feature_column_group_id=0,
            bias_feature_id=-1,
            class_id=-1,
            multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
    partitions, gains, splits = sess.run([partitions, gains, splits])
  self.assertAllEqual([0, 1], partitions)

  # ---- Partition 0 ----
  # -(0.2 + 1.2 - 0.1) / (0.12 + 0.2 + 1)
  expected_left_weight = -0.9848484848484846
  # (0.2 + 1.2 - 0.1) ** 2 / (0.12 + 0.2 + 1)
  expected_left_gain = 1.2803030303030298
  # -(-0.5 + 0.1) / (0.07 + 1)
  expected_right_weight = 0.37383177570093457
  # (-0.5 + 0.1) ** 2 / (0.07 + 1)
  expected_right_gain = 0.14953271028037385
  # (0.2 + -0.5 + 1.2 - 0.1) ** 2 / (0.12 + 0.07 + 0.2 + 1)
  expected_bias_gain = 0.46043165467625885

  split_info = split_info_pb2.SplitInfo()
  split_info.ParseFromString(splits[0])
  left_child = split_info.left_child.vector
  right_child = split_info.right_child.vector
  split_node = split_info.split_node.categorical_id_binary_split

  self.assertEqual(0, split_node.feature_column)
  self.assertEqual(2, split_node.feature_id)
  self.assertAllClose(
      expected_left_gain + expected_right_gain - expected_bias_gain, gains[0],
      0.00001)
  self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
  self.assertAllClose([expected_right_weight], right_child.value, 0.00001)

  # ---- Partition 1 ----
  # (-4 + 0.1) / (0.13 + 1)
  expected_left_weight = -3.4513274336283186
  # (-4 + 0.1) ** 2 / (0.13 + 1)
  expected_left_gain = 13.460176991150442
  expected_right_weight = 0
  expected_right_gain = 0
  # (-4 + 0.1) ** 2 / (0.13 + 1)
  expected_bias_gain = 13.460176991150442

  # Verify candidate for partition 1, there's only one active feature here
  # so zero gain is expected.
  split_info = split_info_pb2.SplitInfo()
  split_info.ParseFromString(splits[1])
  left_child = split_info.left_child.vector
  right_child = split_info.right_child.vector
  split_node = split_info.split_node.categorical_id_binary_split

  # Assert against the derived expectation instead of a bare 0.0 so the
  # expected gains above are actually used; the expression is exactly 0.0.
  self.assertAllClose(
      expected_left_gain + expected_right_gain - expected_bias_gain, gains[1],
      0.00001)
  self.assertAllClose([expected_left_weight], left_child.value, 0.00001)
  self.assertAllClose([expected_right_weight], right_child.value, 0.00001)
  self.assertEqual(0, split_node.feature_column)
  self.assertEqual(1, split_node.feature_id)