def make_splits(self, stamp_token, next_stamp_token, class_id):
    """Create the best split using the accumulated stats and flush the state.

    Args:
      stamp_token: Stamp token the accumulator state must match when flushing.
      next_stamp_token: New stamp token for the accumulator after the flush.
      class_id: Class id for which the splits are built.

    Returns:
      A tuple (are_splits_ready, partition_ids, gains, split_infos).
    """
    # Get the aggregated gradients and hessians per <partition_id, feature_id>
    # pair.
    num_minibatches, partition_ids, feature_ids, gradients, hessians = (
        self._stats_accumulator.flush(stamp_token, next_stamp_token))
    # For sum_reduction, we don't need to divide by number of minibatches.
    # Use math_ops.cast instead of the deprecated math_ops.to_int64 (matches
    # the other handlers in this file).
    num_minibatches = control_flow_ops.cond(
        ops.convert_to_tensor(self._loss_uses_sum_reduction),
        lambda: math_ops.cast(1, dtypes.int64), lambda: num_minibatches)
    partition_ids, gains, split_infos = (
        split_handler_ops.build_categorical_equality_splits(
            num_minibatches=num_minibatches,
            partition_ids=partition_ids,
            feature_ids=feature_ids,
            gradients=gradients,
            hessians=hessians,
            class_id=class_id,
            feature_column_group_id=self._feature_column_group_id,
            l1_regularization=self._l1_regularization,
            l2_regularization=self._l2_regularization,
            tree_complexity_regularization=self._tree_complexity_regularization,
            min_node_weight=self._min_node_weight,
            bias_feature_id=_BIAS_FEATURE_ID,
            multiclass_strategy=self._multiclass_strategy,
            weak_learner_type=self._weak_learner_type))
    # There are no warm-up rounds needed in the equality column handler. So we
    # always return ready.
    are_splits_ready = constant_op.constant(True)
    return (are_splits_ready, partition_ids, gains, split_infos)
# Example #2 (snippet separator from the source aggregator)
 def testMakeCategoricalEqualitySplitEmptyInput(self):
     """Empty accumulated stats must yield empty partitions, gains, splits."""
     with self.test_session() as sess:
         # No accumulated gradients/hessians at all.
         split_outputs = split_handler_ops.build_categorical_equality_splits(
             num_minibatches=0,
             partition_ids=[],
             feature_ids=[[]],
             gradients=[],
             hessians=[],
             l1_regularization=0.1,
             l2_regularization=1,
             tree_complexity_regularization=0,
             min_node_weight=0,
             feature_column_group_id=0,
             bias_feature_id=-1,
             class_id=-1,
             multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
         partitions, gains, splits = sess.run(list(split_outputs))
     for output in (partitions, gains, splits):
         self.assertEqual(0, len(output))
# Example #3 (snippet separator from the source aggregator)
    def make_splits(self, stamp_token, next_stamp_token, class_id):
        """Creates the best split from accumulated stats and flushes the state."""
        # Flush the aggregated <partition_id, feature_id> gradient/hessian stats.
        (num_minibatches, partition_ids, feature_ids, gradients,
         hessians) = self._stats_accumulator.flush(stamp_token, next_stamp_token)
        # With sum reduction, stats are not averaged over minibatches, so the
        # effective minibatch count is one.
        one = lambda: math_ops.cast(1, dtypes.int64)
        num_minibatches = control_flow_ops.cond(
            ops.convert_to_tensor(self._loss_uses_sum_reduction), one,
            lambda: num_minibatches)
        partition_ids, gains, split_infos = (
            split_handler_ops.build_categorical_equality_splits(
                num_minibatches=num_minibatches,
                partition_ids=partition_ids,
                feature_ids=feature_ids,
                gradients=gradients,
                hessians=hessians,
                class_id=class_id,
                feature_column_group_id=self._feature_column_group_id,
                l1_regularization=self._l1_regularization,
                l2_regularization=self._l2_regularization,
                tree_complexity_regularization=self.
                _tree_complexity_regularization,
                min_node_weight=self._min_node_weight,
                bias_feature_id=_BIAS_FEATURE_ID,
                multiclass_strategy=self._multiclass_strategy,
                weak_learner_type=self._weak_learner_type))
        # The equality column handler needs no warm-up rounds, so splits are
        # always reported as ready.
        return (constant_op.constant(True), partition_ids, gains, split_infos)
 def make_splits(self, stamp_token, next_stamp_token, class_id):
   """Creates the best categorical-equality split and flushes the state."""
   # Pull the aggregated stats, keyed by <partition_id, feature_id>.
   flushed = self._stats_accumulator.flush(stamp_token, next_stamp_token)
   num_minibatches, partition_ids, feature_ids, gradients, hessians = flushed
   partition_ids, gains, split_infos = (
       split_handler_ops.build_categorical_equality_splits(
           num_minibatches=num_minibatches,
           partition_ids=partition_ids,
           feature_ids=feature_ids,
           gradients=gradients,
           hessians=hessians,
           class_id=class_id,
           feature_column_group_id=self._feature_column_group_id,
           l1_regularization=self._l1_regularization,
           l2_regularization=self._l2_regularization,
           tree_complexity_regularization=self._tree_complexity_regularization,
           min_node_weight=self._min_node_weight,
           bias_feature_id=_BIAS_FEATURE_ID,
           multiclass_strategy=self._multiclass_strategy))
   # No warm-up rounds are needed for the equality handler; always ready.
   return (constant_op.constant(True), partition_ids, gains, split_infos)
 def testMakeCategoricalEqualitySplitEmptyInput(self):
   """Empty accumulated stats must yield empty partitions, gains, splits."""
   with self.test_session() as sess:
     # No accumulated gradients/hessians at all.
     split_outputs = split_handler_ops.build_categorical_equality_splits(
         num_minibatches=0,
         partition_ids=[],
         feature_ids=[[]],
         gradients=[],
         hessians=[],
         l1_regularization=0.1,
         l2_regularization=1,
         tree_complexity_regularization=0,
         min_node_weight=0,
         feature_column_group_id=0,
         bias_feature_id=-1,
         class_id=-1,
         multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
     partitions, gains, splits = sess.run(list(split_outputs))
   for output in (partitions, gains, splits):
     self.assertEqual(0, len(output))
 def make_splits(self, stamp_token, next_stamp_token, class_id):
     """Creates the best categorical-equality split and flushes the state."""
     # Pull the aggregated stats, keyed by <partition_id, feature_id>.
     flushed = self._stats_accumulator.flush(stamp_token, next_stamp_token)
     num_minibatches, partition_ids, feature_ids, gradients, hessians = flushed
     partition_ids, gains, split_infos = (
         split_handler_ops.build_categorical_equality_splits(
             num_minibatches=num_minibatches,
             partition_ids=partition_ids,
             feature_ids=feature_ids,
             gradients=gradients,
             hessians=hessians,
             class_id=class_id,
             feature_column_group_id=self._feature_column_group_id,
             l1_regularization=self._l1_regularization,
             l2_regularization=self._l2_regularization,
             tree_complexity_regularization=self._tree_complexity_regularization,
             min_node_weight=self._min_node_weight,
             bias_feature_id=_BIAS_FEATURE_ID,
             multiclass_strategy=self._multiclass_strategy))
     # No warm-up rounds are needed for the equality handler; always ready.
     return (constant_op.constant(True), partition_ids, gains, split_infos)
# Example #7 (snippet separator from the source aggregator)
    def testMakeMulticlassCategoricalEqualitySplit(self):
        """Tests split handler op for categorical equality split in multiclass."""
        with self.cached_session() as sess:
            # Two-class gradient vector per example.
            gradients = array_ops.constant([[1.8, 3.5], [2.4, 1.0], [0.4, 4.0],
                                            [9.0, 3.1], [3.0, 0.8]])

            # One full 2x2 hessian per example (FULL_HESSIAN strategy below).
            hessian_0 = [[0.78, 1], [12, 1]]
            hessian_1 = [[0.4, 1], [1, 1]]
            hessian_2 = [[0.24, 1], [1, 1]]
            hessian_3 = [[0.16, 2], [-1, 1]]
            hessian_4 = [[0.6, 1], [2, 1]]

            hessians = array_ops.constant(
                [hessian_0, hessian_1, hessian_2, hessian_3, hessian_4])
            partition_ids = [0, 0, 0, 1, 1]
            # First column is the feature id; -1 matches bias_feature_id below.
            feature_ids = array_ops.constant(
                [[-1, 0], [1, 0], [2, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
            partitions, gains, splits = (
                split_handler_ops.build_categorical_equality_splits(
                    num_minibatches=2,
                    partition_ids=partition_ids,
                    feature_ids=feature_ids,
                    gradients=gradients,
                    hessians=hessians,
                    l1_regularization=0.1,
                    l2_regularization=1,
                    tree_complexity_regularization=0,
                    min_node_weight=0,
                    feature_column_group_id=0,
                    bias_feature_id=-1,
                    class_id=-1,
                    multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN,
                    weak_learner_type=learner_pb2.LearnerConfig.
                    NORMAL_DECISION_TREE))
            partitions, gains, splits = sess.run([partitions, gains, splits])
        # One result row per input partition id.
        self.assertAllEqual([0, 1], partitions)

        # Inspect the serialized split proto emitted for partition 1.
        split_info = split_info_pb2.SplitInfo()
        split_info.ParseFromString(splits[1])
        left_child = split_info.left_child.vector
        right_child = split_info.right_child.vector
        split_node = split_info.split_node.categorical_id_binary_split

        # Each leaf has 2 element vector.
        self.assertEqual(2, len(left_child.value))
        self.assertEqual(2, len(right_child.value))

        self.assertEqual(0, split_node.feature_column)
        self.assertEqual(1, split_node.feature_id)
  def testMakeMulticlassCategoricalEqualitySplit(self):
    """Tests split handler op for categorical equality split in multiclass."""
    with self.test_session() as sess:
      # Two-class gradient vector per example.
      gradients = array_ops.constant([[1.8, 3.5], [2.4, 1.0], [0.4, 4.0],
                                      [9.0, 3.1], [3.0, 0.8]])

      # One full 2x2 hessian per example (FULL_HESSIAN strategy below).
      per_example_hessians = [
          [[0.78, 1], [12, 1]],
          [[0.4, 1], [1, 1]],
          [[0.24, 1], [1, 1]],
          [[0.16, 2], [-1, 1]],
          [[0.6, 1], [2, 1]],
      ]
      hessians = array_ops.constant(per_example_hessians)
      partition_ids = [0, 0, 0, 1, 1]
      feature_ids = array_ops.constant(
          [[-1, 0], [1, 0], [2, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
      split_outputs = split_handler_ops.build_categorical_equality_splits(
          num_minibatches=2,
          partition_ids=partition_ids,
          feature_ids=feature_ids,
          gradients=gradients,
          hessians=hessians,
          l1_regularization=0.1,
          l2_regularization=1,
          tree_complexity_regularization=0,
          min_node_weight=0,
          feature_column_group_id=0,
          bias_feature_id=-1,
          class_id=-1,
          multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN)
      partitions, gains, splits = sess.run(list(split_outputs))
    self.assertAllEqual([0, 1], partitions)

    # Inspect the serialized split proto emitted for partition 1.
    split_info = split_info_pb2.SplitInfo()
    split_info.ParseFromString(splits[1])
    split_node = split_info.split_node.categorical_id_binary_split

    # Multiclass leaves carry a 2-element weight vector per child.
    self.assertEqual(2, len(split_info.left_child.vector.value))
    self.assertEqual(2, len(split_info.right_child.vector.value))

    self.assertEqual(0, split_node.feature_column)
    self.assertEqual(1, split_node.feature_id)
# Example #9 (snippet separator from the source aggregator)
    def testMakeCategoricalEqualitySplit(self):
        """Tests split handler op for categorical equality split.

        Feeds hand-computed per-(partition, feature) gradient/hessian sums and
        checks the resulting split weights and gains against closed-form
        values.
        """
        with self.test_session() as sess:
            # The data looks like the following after dividing by number of steps (2).
            # Gradients    | Partition | Feature ID     |
            # (0.9, 0.39)  | 0         | -1             |
            # (0.2, 0.12)  | 0         | 1              |
            # (1.4, 0.32)  | 0         | 2              |
            # (4.0, 0.13)  | 1         | -1             |
            # (4.0, 0.13)  | 1         | 1              |
            gradients = [1.8, 0.4, 2.8, 8.0, 8.0]
            hessians = [0.78, 0.24, 0.64, 0.26, 0.26]
            partition_ids = [0, 0, 0, 1, 1]
            # First column is the feature id; -1 matches bias_feature_id below.
            feature_ids = array_ops.constant(
                [[-1, 0], [1, 0], [2, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
            partitions, gains, splits = (
                split_handler_ops.build_categorical_equality_splits(
                    num_minibatches=2,
                    partition_ids=partition_ids,
                    feature_ids=feature_ids,
                    gradients=gradients,
                    hessians=hessians,
                    l1_regularization=0.1,
                    l2_regularization=1,
                    tree_complexity_regularization=0,
                    min_node_weight=0,
                    feature_column_group_id=0,
                    bias_feature_id=-1,
                    class_id=-1,
                    multiclass_strategy=learner_pb2.LearnerConfig.
                    TREE_PER_CLASS))
            partitions, gains, splits = sess.run([partitions, gains, splits])
        # One result row per input partition id.
        self.assertAllEqual([0, 1], partitions)

        # Check the split on partition 0.
        # -(0.2 + 1.2 - 0.1) / (0.12 + 0.2 + 1)
        expected_left_weight = -0.9848484848484846

        # (0.2 + 1.2 - 0.1) ** 2 / (0.12 + 0.2 + 1)
        expected_left_gain = 1.2803030303030298

        # -(-0.5 + 0.1) / (0.07 + 1)
        expected_right_weight = 0.37383177570093457

        # (-0.5 + 0.1) ** 2 / (0.07 + 1)
        expected_right_gain = 0.14953271028037385

        # (0.2 + -0.5 + 1.2 - 0.1) ** 2 / (0.12 + 0.07 + 0.2 + 1)
        expected_bias_gain = 0.46043165467625885

        split_info = split_info_pb2.SplitInfo()
        split_info.ParseFromString(splits[0])
        left_child = split_info.left_child.vector
        right_child = split_info.right_child.vector
        split_node = split_info.split_node.categorical_id_binary_split

        self.assertEqual(0, split_node.feature_column)

        self.assertEqual(2, split_node.feature_id)

        self.assertAllClose(
            expected_left_gain + expected_right_gain - expected_bias_gain,
            gains[0], 0.00001)

        self.assertAllClose([expected_left_weight], left_child.value, 0.00001)

        self.assertAllClose([expected_right_weight], right_child.value,
                            0.00001)

        # Check the split on partition 1.
        # (-4 + 0.1) / (0.13 + 1)
        expected_left_weight = -3.4513274336283186
        # (-4 + 0.1) ** 2 / (0.13 + 1)
        expected_left_gain = 13.460176991150442
        expected_right_weight = 0
        expected_right_gain = 0
        # (-4 + 0.1) ** 2 / (0.13 + 1)
        expected_bias_gain = 13.460176991150442

        # Verify candidate for partition 1, there's only one active feature here
        # so zero gain is expected.
        split_info = split_info_pb2.SplitInfo()
        split_info.ParseFromString(splits[1])
        left_child = split_info.left_child.vector
        right_child = split_info.right_child.vector
        split_node = split_info.split_node.categorical_id_binary_split
        self.assertAllClose(0.0, gains[1], 0.00001)

        self.assertAllClose([expected_left_weight], left_child.value, 0.00001)

        self.assertAllClose([expected_right_weight], right_child.value,
                            0.00001)

        self.assertEqual(0, split_node.feature_column)

        self.assertEqual(1, split_node.feature_id)
  def testMakeCategoricalEqualitySplit(self):
    """Tests split handler op for categorical equality split."""
    with self.test_session() as sess:
      # After dividing by the number of steps (2) the accumulated data is:
      # Gradients    | Partition | Feature ID     |
      # (0.9, 0.39)  | 0         | -1             |
      # (0.2, 0.12)  | 0         | 1              |
      # (1.4, 0.32)  | 0         | 2              |
      # (4.0, 0.13)  | 1         | -1             |
      # (4.0, 0.13)  | 1         | 1              |
      split_outputs = split_handler_ops.build_categorical_equality_splits(
          num_minibatches=2,
          partition_ids=[0, 0, 0, 1, 1],
          feature_ids=array_ops.constant(
              [[-1, 0], [1, 0], [2, 0], [-1, 0], [1, 0]], dtype=dtypes.int64),
          gradients=[1.8, 0.4, 2.8, 8.0, 8.0],
          hessians=[0.78, 0.24, 0.64, 0.26, 0.26],
          l1_regularization=0.1,
          l2_regularization=1,
          tree_complexity_regularization=0,
          min_node_weight=0,
          feature_column_group_id=0,
          bias_feature_id=-1,
          class_id=-1,
          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
      partitions, gains, splits = sess.run(list(split_outputs))
    self.assertAllEqual([0, 1], partitions)

    def _parse(serialized):
      # Deserializes a split and returns its children and split node.
      info = split_info_pb2.SplitInfo()
      info.ParseFromString(serialized)
      return (info.left_child.vector, info.right_child.vector,
              info.split_node.categorical_id_binary_split)

    # Check the split on partition 0.
    # -(0.2 + 1.2 - 0.1) / (0.12 + 0.2 + 1)
    expected_left_weight = -0.9848484848484846
    # (0.2 + 1.2 - 0.1) ** 2 / (0.12 + 0.2 + 1)
    expected_left_gain = 1.2803030303030298
    # -(-0.5 + 0.1) / (0.07 + 1)
    expected_right_weight = 0.37383177570093457
    # (-0.5 + 0.1) ** 2 / (0.07 + 1)
    expected_right_gain = 0.14953271028037385
    # (0.2 + -0.5 + 1.2 - 0.1) ** 2 / (0.12 + 0.07 + 0.2 + 1)
    expected_bias_gain = 0.46043165467625885

    left_child, right_child, split_node = _parse(splits[0])

    self.assertEqual(0, split_node.feature_column)

    self.assertEqual(2, split_node.feature_id)

    self.assertAllClose(
        expected_left_gain + expected_right_gain - expected_bias_gain, gains[0],
        0.00001)

    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)

    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)

    # Check the split on partition 1.
    # (-4 + 0.1) / (0.13 + 1)
    expected_left_weight = -3.4513274336283186
    expected_right_weight = 0

    # Verify candidate for partition 1; there's only one active feature here,
    # so zero gain is expected.
    left_child, right_child, split_node = _parse(splits[1])
    self.assertAllClose(0.0, gains[1], 0.00001)

    self.assertAllClose([expected_left_weight], left_child.value, 0.00001)

    self.assertAllClose([expected_right_weight], right_child.value, 0.00001)

    self.assertEqual(0, split_node.feature_column)

    self.assertEqual(1, split_node.feature_id)