Usage examples of split_handler_ops.build_sparse_inequality_splits and the sparse split handlers that call it, from TensorFlow's contrib boosted_trees package.

Example #1
 def testMakeSparseSplitAllEmptyDimensions(self):
   """Tests split handler op when all dimensions have only bias bucket id."""
   with self.test_session() as sess:
      # The data looks like the following after dividing by the number of steps (2).
     # Gradients    | Partition | Dimension | bucket ID       |
     # (0.9, 0.39)  | 0         |    0      |  -1             |
     # (4.0, 0.13)  | 1         |    0      |  -1             |
     partition_ids = array_ops.constant([0, 1], dtype=dtypes.int32)
     # We have only 1 dimension in our sparse feature column.
     bucket_ids = array_ops.constant([[-1, 0], [-1, 0]], dtype=dtypes.int64)
     gradients = array_ops.constant([1.8, 8.0])
     hessians = array_ops.constant([0.78, 0.26])
     bucket_boundaries = array_ops.constant([0.3, 0.52])
     partitions, gains, splits = (
         split_handler_ops.build_sparse_inequality_splits(
             num_minibatches=2,
             partition_ids=partition_ids,
             bucket_ids=bucket_ids,
             gradients=gradients,
             hessians=hessians,
             bucket_boundaries=bucket_boundaries,
             l1_regularization=0,
             l2_regularization=2,
             tree_complexity_regularization=0,
             min_node_weight=0,
             feature_column_group_id=0,
             bias_feature_id=-1,
             class_id=-1,
             multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
     partitions, gains, splits = (sess.run([partitions, gains, splits]))
   self.assertEqual(0, len(partitions))
   self.assertEqual(0, len(splits))
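
Why this example returns empty outputs is easiest to see by redoing the arithmetic from the comment table. A minimal plain-Python sketch, assuming only what the comments state (stats are averaged over num_minibatches, and bucket id -1 is the bias bucket):

num_minibatches = 2
gradients = [1.8, 8.0]
hessians = [0.78, 0.26]
bucket_ids = [-1, -1]  # per-entry bucket ids; -1 is the bias bucket

# Averaging reproduces the table above: [(0.9, 0.39), (4.0, 0.13)].
averaged = [(g / num_minibatches, h / num_minibatches)
            for g, h in zip(gradients, hessians)]
print(averaged)

# Every entry sits in the bias bucket, so there are no candidate
# thresholds and the op returns empty partitions and splits.
candidates = [b for b in bucket_ids if b != -1]
print(candidates)  # []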
Example #2
  def testMakeSparseSplitDefaultDirectionIsStable(self):
    """Tests default direction is stable when no sparsity."""
    random.seed(1123)
    for _ in range(50):
      with self.test_session() as sess:
        grad = random.random()
        hessian = random.random()
        # The data looks like the following (after dividing by the number of
        # steps, 2):
        # Gradients       | Partition | bucket ID       |
        # (grad, hessian) |  0        | -1              |
        # ...followed by 100 buckets of (grad/100, hessian/100), so there is
        # no sparsity.
        n_buckets = 100

        # 1 for the overall sum, and 100 buckets.
        partition_ids = array_ops.constant(
            [0] * (n_buckets + 1), dtype=dtypes.int32)
        # We have only 1 dimension in our sparse feature column.

        bucket_ids = [-1] + list(range(n_buckets))
        bucket_ids = array_ops.constant(bucket_ids, dtype=dtypes.int64)
        dimension_ids = array_ops.constant(
            [0] * (n_buckets + 1), dtype=dtypes.int64)
        bucket_ids = array_ops.stack([bucket_ids, dimension_ids], axis=1)

        gradients = [grad] + [grad / n_buckets] * n_buckets
        gradients = array_ops.constant(gradients)
        hessians = [hessian] + [hessian / n_buckets] * n_buckets
        hessians = array_ops.constant(hessians)

        boundaries = list(range(n_buckets + 1))
        bucket_boundaries = array_ops.constant(boundaries, dtype=dtypes.float32)

        partitions, gains, splits = (
            split_handler_ops.build_sparse_inequality_splits(
                num_minibatches=2,
                partition_ids=partition_ids,
                bucket_ids=bucket_ids,
                gradients=gradients,
                hessians=hessians,
                bucket_boundaries=bucket_boundaries,
                l1_regularization=0,
                l2_regularization=2,
                tree_complexity_regularization=0,
                min_node_weight=0,
                feature_column_group_id=0,
                bias_feature_id=-1,
                class_id=-1,
                multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
        partitions, gains, splits = (sess.run([partitions, gains, splits]))
      self.assertAllEqual([0], partitions)
      self.assertEqual(1, len(splits))

      split_info = split_info_pb2.SplitInfo()
      split_info.ParseFromString(splits[0])
      self.assertTrue(
          split_info.split_node.HasField(
              'sparse_float_binary_split_default_left'))
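
Per the tests in this file, the op consumes bucket ids as an [N, 2] int64 tensor whose rows are [bucket_id, dimension_id]. A numpy sketch of the layout this test builds with array_ops.stack, useful for previewing the shape without TensorFlow:

import numpy as np

n_buckets = 100
bucket_ids = np.array([-1] + list(range(n_buckets)), dtype=np.int64)
dimension_ids = np.zeros(n_buckets + 1, dtype=np.int64)

# Same as array_ops.stack([bucket_ids, dimension_ids], axis=1) above.
stacked = np.stack([bucket_ids, dimension_ids], axis=1)
print(stacked.shape)  # (101, 2): one bias row plus 100 bucket rows
print(stacked[:3])    # [[-1 0] [0 0] [1 0]]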
Example #3
def _make_sparse_split(
    quantile_accumulator_handle, stats_accumulator_handle, stamp_token,
    next_stamp_token, multiclass_strategy, class_id, feature_column_id,
    l1_regularization, l2_regularization, tree_complexity_regularization,
    min_node_weight, is_multi_dimentional, loss_uses_sum_reduction):
  """Function that builds splits for a sparse feature column."""
  # Get the bucket boundaries
  are_splits_ready, buckets = (
      gen_quantile_ops.quantile_accumulator_get_buckets(
          quantile_accumulator_handles=[quantile_accumulator_handle],
          stamp_token=stamp_token))
  # quantile_accumulator_get_buckets returns a list of results per handle that
  # we pass to it. In this case we're getting results just for one resource.
  are_splits_ready = are_splits_ready[0]
  buckets = buckets[0]

  # After we receive the boundaries from previous iteration we can flush
  # the quantile accumulator.
  with ops.control_dependencies([buckets]):
    flush_quantiles = gen_quantile_ops.quantile_accumulator_flush(
        quantile_accumulator_handle=quantile_accumulator_handle,
        stamp_token=stamp_token,
        next_stamp_token=next_stamp_token)

  if is_multi_dimentional:
    num_minibatches, partition_ids, bucket_ids, gradients, hessians = (
        gen_stats_accumulator_ops.stats_accumulator_tensor_flush(
            stats_accumulator_handle, stamp_token, next_stamp_token))
  else:
    num_minibatches, partition_ids, bucket_ids, gradients, hessians = (
        gen_stats_accumulator_ops.stats_accumulator_scalar_flush(
            stats_accumulator_handle, stamp_token, next_stamp_token))
  num_minibatches = control_flow_ops.cond(
      loss_uses_sum_reduction,
      lambda: math_ops.cast(1, dtypes.int64),
      lambda: num_minibatches)
  # Put quantile and stats accumulator flushing in the dependency path.
  with ops.control_dependencies([flush_quantiles, partition_ids]):
    are_splits_ready = array_ops.identity(are_splits_ready)
  partition_ids, gains, split_infos = (
      split_handler_ops.build_sparse_inequality_splits(
          num_minibatches=num_minibatches,
          bucket_boundaries=buckets,
          partition_ids=partition_ids,
          bucket_ids=bucket_ids,
          gradients=gradients,
          hessians=hessians,
          class_id=class_id,
          feature_column_group_id=feature_column_id,
          l1_regularization=l1_regularization,
          l2_regularization=l2_regularization,
          tree_complexity_regularization=tree_complexity_regularization,
          min_node_weight=min_node_weight,
          bias_feature_id=_BIAS_FEATURE_ID,
          multiclass_strategy=multiclass_strategy))
  return are_splits_ready, partition_ids, gains, split_infos
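
The control_dependencies/identity pattern above is what forces the quantile and stats flushes to run before are_splits_ready is consumed downstream. A self-contained TF1-style sketch of the same pattern; counter, flush_op, and ready are illustrative stand-ins, not names from the module above:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

counter = tf.Variable(0)
flush_op = counter.assign_add(1)  # stands in for flush_quantiles
ready = tf.constant(True)         # stands in for are_splits_ready

# Re-emitting `ready` through identity under a control dependency makes
# every consumer of the new tensor wait for flush_op.
with tf.control_dependencies([flush_op]):
  ready = tf.identity(ready)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(ready))    # True; evaluating it also ran flush_op
  print(sess.run(counter))  # 1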
Example #4
  def testMakeMulticlassSparseSplit(self):
    """Tests split handler op with multiclass full-hessian stats."""
    with self.test_session() as sess:
      partition_ids = array_ops.constant([0, 0, 0, 1, 1], dtype=dtypes.int32)
      bucket_ids = array_ops.constant(
          [[-1, 0], [0, 0], [1, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
      gradients = array_ops.constant([[1.8, 3.5], [2.4, 1.0], [0.4, 4.0],
                                      [8.0, 3.1], [8.0, 0.8]])

      hessian_0 = [[0.78, 1], [12, 1]]
      hessian_1 = [[0.4, 1], [1, 1]]
      hessian_2 = [[0.24, 1], [1, 1]]
      hessian_3 = [[0.26, 1], [1, 1]]
      hessian_4 = [[0.26, 1], [1, 1]]

      hessians = array_ops.constant(
          [hessian_0, hessian_1, hessian_2, hessian_3, hessian_4])
      bucket_boundaries = array_ops.constant([0.3, 0.52])
      partitions, gains, splits = (
          split_handler_ops.build_sparse_inequality_splits(
              num_minibatches=2,
              partition_ids=partition_ids,
              bucket_ids=bucket_ids,
              gradients=gradients,
              hessians=hessians,
              bucket_boundaries=bucket_boundaries,
              l1_regularization=0,
              l2_regularization=2,
              tree_complexity_regularization=0,
              min_node_weight=0,
              feature_column_group_id=0,
              bias_feature_id=-1,
              class_id=-1,
              multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN))
      partitions, gains, splits = (sess.run([partitions, gains, splits]))

    split_info = split_info_pb2.SplitInfo()
    split_info.ParseFromString(splits[0])
    left_child = split_info.left_child.vector
    right_child = split_info.right_child.vector
    split_node = split_info.split_node.sparse_float_binary_split_default_right

    # Each leaf holds a 2-element vector (one value per class).
    self.assertEqual(2, len(left_child.value))
    self.assertEqual(2, len(right_child.value))

    self.assertEqual(0, split_node.split.feature_column)
    self.assertAllClose(0.52, split_node.split.threshold)
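
Under FULL_HESSIAN each entry carries a per-class gradient vector and a full hessian matrix, and a leaf weight is conceptually a Newton step. A numpy sketch of that step on the test's first entry; it illustrates why each leaf comes out as a 2-element vector, and the exact formula is an assumption about the math rather than a trace of the op's output:

import numpy as np

l2 = 2.0
g = np.array([1.8, 3.5])                  # 2-class gradient of one entry
H = np.array([[0.78, 1.0], [12.0, 1.0]])  # matching full 2x2 hessian

# Newton step with L2 damping: w = -(H + l2*I)^{-1} g.
w = -np.linalg.solve(H + l2 * np.eye(2), g)
print(w.shape)  # (2,) -- a 2-element leaf vector, as the test asserts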
Example #5
  def make_splits(self, stamp_token, next_stamp_token, class_id):
    """Create the best split using the accumulated stats and flush the state."""
    # Get the bucket boundaries
    are_splits_ready, buckets = (
        self._quantile_accumulator.get_buckets(stamp_token))

    # After we receive the boundaries from previous iteration we can flush
    # the quantile accumulator.
    with ops.control_dependencies([buckets]):
      flush_quantiles = self._quantile_accumulator.flush(
          stamp_token=stamp_token, next_stamp_token=next_stamp_token)

    with ops.device(None):
      with ops.device(self._stats_accumulator.resource().device):
        num_minibatches, partition_ids, bucket_ids, gradients, hessians = (
            self._stats_accumulator.flush(stamp_token, next_stamp_token))

        # Put quantile and stats accumulator flushing in the dependency path.
        are_splits_ready = control_flow_ops.with_dependencies(
            [flush_quantiles, partition_ids], are_splits_ready)
        partition_ids, gains, split_infos = (
            split_handler_ops.build_sparse_inequality_splits(
                num_minibatches=num_minibatches,
                bucket_boundaries=buckets,
                partition_ids=partition_ids,
                bucket_ids=bucket_ids,
                gradients=gradients,
                hessians=hessians,
                class_id=class_id,
                feature_column_group_id=self._feature_column_group_id,
                l1_regularization=self._l1_regularization,
                l2_regularization=self._l2_regularization,
                tree_complexity_regularization=self.
                _tree_complexity_regularization,
                min_node_weight=self._min_node_weight,
                bias_feature_id=_BIAS_FEATURE_ID,
                multiclass_strategy=self._multiclass_strategy))
    return (are_splits_ready, partition_ids, gains, split_infos)
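
One detail worth noting in make_splits is the nested device scopes: ops.device(None) cancels any placement inherited from the caller, letting the inner scope pin the flush and split computation to the stats accumulator's device. A minimal TF1-style sketch of that behavior, with illustrative device strings:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

with tf.device('/job:worker/task:0'):  # placement inherited from a caller
  with tf.device(None):                # cancel the enclosing scope
    with tf.device('/job:ps/task:0'):  # pin to the desired device
      op = tf.constant(1.0)

print(op.device)  # /job:ps/task:0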
Example #6
  def testMakeSparseMultidimensionalSplit(self):
    """Tests split handler op."""
    with self.test_session() as sess:
      # The number of steps is 2.
      # The feature column is three-dimensional. The first dimension has the
      # bias bucket only, the second has the bias bucket and two valid
      # buckets, and the third has just one bias bucket and one valid bucket.
      # Gradients    | Partition | Dimension | bucket ID       |
      # (0.9, 0.39)  |    0      |     0     |     -1          |
      # (1.2, 0.2)   |    0      |     1     |      0          |
      # (0.2, 0.12)  |    0      |     1     |      2          |
      # (0.1, 0.1)   |    0      |     2     |      3          |
      # The second node is less interesting: effectively one dimension, with
      # the same bucket ids across its dimensions.
      # (4.0, 0.13)  |    1      |     0     |     -1          |
      # (4.0, 0.13)  |    1      |     2     |      3          |

      # Tree node ids.
      partition_ids = array_ops.constant([0, 0, 0, 0, 1, 1], dtype=dtypes.int32)

      dimension_ids = array_ops.constant([0, 1, 1, 2, 0, 2], dtype=dtypes.int64)
      bucket_ids = array_ops.constant([-1, 0, 2, 3, -1, 3], dtype=dtypes.int64)
      bucket_ids = array_ops.stack([bucket_ids, dimension_ids], axis=1)

      gradients = array_ops.constant([1.8, 2.4, 0.4, 0.2, 8.0, 8.0])
      hessians = array_ops.constant([0.78, 0.4, 0.24, 0.2, 0.26, 0.26])
      bucket_boundaries = array_ops.constant([0.3, 0.52, 0.58, 0.6])
      partitions, gains, splits = (
          split_handler_ops.build_sparse_inequality_splits(
              num_minibatches=2,
              partition_ids=partition_ids,
              bucket_ids=bucket_ids,
              gradients=gradients,
              hessians=hessians,
              bucket_boundaries=bucket_boundaries,
              l1_regularization=0,
              l2_regularization=2,
              tree_complexity_regularization=0,
              min_node_weight=0,
              feature_column_group_id=0,
              bias_feature_id=-1,
              class_id=-1,
              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
      partitions, gains, splits = (sess.run([partitions, gains, splits]))
    self.assertAllEqual([0, 1], partitions)
    self.assertEqual(2, len(splits))
    # Check the split on node 0 - it should split on the second dimension.
    # -(0.2 + 1.2) / (0.12 + 0.2 + 2)
    expected_left_weight = -0.603448275862069
    # (0.2 + 1.2) ** 2 / (0.12 + 0.2 + 2)
    expected_left_gain = 0.8448275862068965
    # 0.5 / (0.07 + 2)
    expected_right_weight = 0.24154589371980678
    # 0.5 ** 2 / (0.07 + 2)
    expected_right_gain = 0.12077294685990339
    # (0.2 + 1.2 - 0.5) ** 2 /  (0.12 + 0.2 + 0.07 + 2)
    expected_bias_gain = 0.3389121338912133

    split_info = split_info_pb2.SplitInfo()
    split_info.ParseFromString(splits[0])
    left_child = split_info.left_child.vector
    right_child = split_info.right_child.vector
    split_node = split_info.split_node.sparse_float_binary_split_default_right
    self.assertAllClose(
        expected_left_gain + expected_right_gain - expected_bias_gain, gains[0])

    self.assertAllClose([expected_left_weight], left_child.value)

    self.assertAllClose([expected_right_weight], right_child.value)

    self.assertEqual(0, split_node.split.feature_column)
    # Split happened on the second dimension.
    self.assertEqual(1, split_node.split.dimension_id)

    self.assertAllClose(0.58, split_node.split.threshold)

    # Check the split on partition 1.
    expected_left_weight = -1.8779342723004695
    expected_right_weight = 0

    # Verify candidate for partition 1, there's only one active bucket here
    # so zero gain is expected.
    split_info.ParseFromString(splits[1])
    left_child = split_info.left_child.vector
    right_child = split_info.right_child.vector
    split_node = split_info.split_node.sparse_float_binary_split_default_left

    self.assertAllClose(0.0, gains[1])

    self.assertAllClose([expected_left_weight], left_child.value)

    self.assertAllClose([expected_right_weight], right_child.value)

    self.assertEqual(0, split_node.split.feature_column)
    self.assertEqual(2, split_node.split.dimension_id)

    self.assertAllClose(0.6, split_node.split.threshold)
Example #7
  def testMakeSparseSplit(self):
    """Tests split handler op."""
    with self.test_session() as sess:
      # The data looks like the following after dividing by the number of steps (2).
      # Gradients    | Partition | bucket ID       |
      # (0.9, 0.39)  | 0         | -1              |
      # (1.2, 0.2)   | 0         | 0               |
      # (0.2, 0.12)  | 0         | 1               |
      # (4.0, 0.13)  | 1         | -1              |
      # (4.0, 0.13)  | 1         | 1               |
      partition_ids = array_ops.constant([0, 0, 0, 1, 1], dtype=dtypes.int32)
      # We have only 1 dimension in our sparse feature column.
      bucket_ids = array_ops.constant([-1, 0, 1, -1, 1], dtype=dtypes.int64)
      dimension_ids = array_ops.constant([0, 0, 0, 0, 0], dtype=dtypes.int64)
      bucket_ids = array_ops.stack([bucket_ids, dimension_ids], axis=1)

      gradients = array_ops.constant([1.8, 2.4, 0.4, 8.0, 8.0])
      hessians = array_ops.constant([0.78, 0.4, 0.24, 0.26, 0.26])
      bucket_boundaries = array_ops.constant([0.3, 0.52])
      partitions, gains, splits = (
          split_handler_ops.build_sparse_inequality_splits(
              num_minibatches=2,
              partition_ids=partition_ids,
              bucket_ids=bucket_ids,
              gradients=gradients,
              hessians=hessians,
              bucket_boundaries=bucket_boundaries,
              l1_regularization=0,
              l2_regularization=2,
              tree_complexity_regularization=0,
              min_node_weight=0,
              feature_column_group_id=0,
              bias_feature_id=-1,
              class_id=-1,
              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
      partitions, gains, splits = (sess.run([partitions, gains, splits]))
    self.assertAllEqual([0, 1], partitions)
    self.assertEqual(2, len(splits))
    # Check the split on partition 0.
    # -(0.2 + 1.2) / (0.12 + 0.2 + 2)
    expected_left_weight = -0.603448275862069
    # (0.2 + 1.2) ** 2 / (0.12 + 0.2 + 2)
    expected_left_gain = 0.8448275862068965
    # 0.5 / (0.07 + 2)
    expected_right_weight = 0.24154589371980678
    # 0.5 ** 2 / (0.07 + 2)
    expected_right_gain = 0.12077294685990339
    # (0.2 + 1.2 - 0.5) ** 2 /  (0.12 + 0.2 + 0.07 + 2)
    expected_bias_gain = 0.3389121338912133

    split_info = split_info_pb2.SplitInfo()
    split_info.ParseFromString(splits[0])
    left_child = split_info.left_child.vector
    right_child = split_info.right_child.vector
    split_node = split_info.split_node.sparse_float_binary_split_default_right
    self.assertAllClose(
        expected_left_gain + expected_right_gain - expected_bias_gain, gains[0])

    self.assertAllClose([expected_left_weight], left_child.value)

    self.assertAllClose([expected_right_weight], right_child.value)

    self.assertEqual(0, split_node.split.feature_column)
    # The sparse feature is one-dimensional.
    self.assertEqual(0, split_node.split.dimension_id)

    self.assertAllClose(0.52, split_node.split.threshold)

    # Check the split on partition 1.
    expected_left_weight = -1.8779342723004695
    expected_right_weight = 0

    # Verify candidate for partition 1, there's only one active bucket here
    # so zero gain is expected.
    split_info.ParseFromString(splits[1])
    left_child = split_info.left_child.vector
    right_child = split_info.right_child.vector
    split_node = split_info.split_node.sparse_float_binary_split_default_left

    self.assertAllClose(0.0, gains[1])

    self.assertAllClose([expected_left_weight], left_child.value)

    self.assertAllClose([expected_right_weight], right_child.value)

    self.assertEqual(0, split_node.split.feature_column)
    # The sparse feature is one-dimensional.
    self.assertEqual(0, split_node.split.dimension_id)

    self.assertAllClose(0.52, split_node.split.threshold)
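
The expected constants in the last two tests all follow the regularized formulas spelled out in the comments: weight = -g / (h + l2) and gain = g**2 / (h + l2), with l2_regularization=2 and the right-side stats being the bias totals minus the left-side sums. A short plain-Python check that reproduces the comments' arithmetic:

l2 = 2.0

# Partition 0 after averaging: the bias row is (0.9, 0.39); buckets 0 and 1
# sum to the left stats; the right stats are the remainder.
g_left, h_left = 0.2 + 1.2, 0.12 + 0.2
g_right, h_right = 0.9 - g_left, 0.39 - h_left   # (-0.5, 0.07)

expected_left_weight = -g_left / (h_left + l2)       # -0.603448...
expected_left_gain = g_left ** 2 / (h_left + l2)     # 0.844827...
expected_right_weight = -g_right / (h_right + l2)    # 0.241545...
expected_right_gain = g_right ** 2 / (h_right + l2)  # 0.120772...
expected_bias_gain = 0.9 ** 2 / (0.39 + l2)          # 0.338912...

# The reported gain is left + right - bias, matching gains[0] above.
print(expected_left_gain + expected_right_gain - expected_bias_gain)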