Example #1
    def testSaveRestoreBeforeFlush(self):
        save_dir = os.path.join(self.get_temp_dir(), "save_restore")
        save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")

        with self.test_session(graph=ops.Graph()) as sess:
            accumulator = quantile_ops.QuantileAccumulator(init_stamp_token=0,
                                                           num_quantiles=3,
                                                           epsilon=0.33,
                                                           name="q0")

            save = saver.Saver()
            resources.initialize_resources(resources.shared_resources()).run()

            sparse_indices_0 = constant_op.constant(
                [[1, 0], [2, 1], [3, 0], [4, 2], [5, 0]], dtype=dtypes.int64)
            sparse_values_0 = constant_op.constant([2.0, 3.0, 4.0, 5.0, 6.0],
                                                   dtype=dtypes.float32)
            sparse_shape_0 = constant_op.constant([6, 3], dtype=dtypes.int64)
            example_weights = constant_op.constant([10, 1, 1, 1, 1, 1],
                                                   dtype=dtypes.float32,
                                                   shape=[6, 1])
            update = accumulator.add_summary(stamp_token=0,
                                             column=sparse_tensor.SparseTensor(
                                                 sparse_indices_0,
                                                 sparse_values_0,
                                                 sparse_shape_0),
                                             example_weights=example_weights)
            update.run()
            save.save(sess, save_path)
            reset = accumulator.flush(stamp_token=0, next_stamp_token=1)
            with ops.control_dependencies([reset]):
                are_ready_flush, buckets = (accumulator.get_buckets(
                    stamp_token=1))
            buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
            self.assertEqual(True, are_ready_flush)
            self.assertAllEqual([2, 4, 6.], buckets)

        with self.test_session(graph=ops.Graph()) as sess:
            accumulator = quantile_ops.QuantileAccumulator(init_stamp_token=0,
                                                           num_quantiles=3,
                                                           epsilon=0.33,
                                                           name="q0")
            save = saver.Saver()

            # Restore the saved values in the parameter nodes.
            save.restore(sess, save_path)
            are_ready_noflush = accumulator.get_buckets(stamp_token=0)[0]
            with ops.control_dependencies([are_ready_noflush]):
                reset = accumulator.flush(stamp_token=0, next_stamp_token=1)

            with ops.control_dependencies([reset]):
                are_ready_flush, buckets = accumulator.get_buckets(
                    stamp_token=1)
            buckets, are_ready_flush, are_ready_noflush = (sess.run(
                [buckets, are_ready_flush, are_ready_noflush]))
            self.assertFalse(are_ready_noflush)
            self.assertTrue(are_ready_flush)
            self.assertAllEqual([2, 4, 6.], buckets)
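The test above exercises the accumulator's stamp-token protocol: summaries are added under the current stamp, flush() commits them and advances the stamp, and get_buckets() only reports ready when queried under the post-flush stamp. A minimal sketch of that lifecycle, assuming the same imports as the tests above (the name "sketch" and the values are illustrative):

# Minimal sketch of the stamp-token lifecycle; assumes a default session
# is active, as inside self.test_session() above.
accumulator = quantile_ops.QuantileAccumulator(
    init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="sketch")
resources.initialize_resources(resources.shared_resources()).run()

# 1. Add summaries under the current stamp (0).
update = accumulator.add_summary(
    stamp_token=0,
    column=constant_op.constant([[1.0], [2.0], [3.0]], dtype=dtypes.float32),
    example_weights=constant_op.constant([[1.0], [1.0], [1.0]],
                                         dtype=dtypes.float32))
update.run()

# 2. flush() commits the pending summaries and advances the stamp 0 -> 1.
reset = accumulator.flush(stamp_token=0, next_stamp_token=1)

# 3. Buckets are "ready" only when queried under the post-flush stamp.
with ops.control_dependencies([reset]):
    are_ready, buckets = accumulator.get_buckets(stamp_token=1)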
Example #2
  def testStreamingQuantileBuckets(self):
    """Sets up the quantile summary op test as follows.

    100 batches of data are added to the accumulator. The batches have the
    form:
    [0 1 .. 99]
    [100 101 .. 199]
    ...
    [9900 9901 .. 9999]
    Every example weight is 1.
    """
    with self.test_session() as sess:
      accumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=0, num_quantiles=3, epsilon=0.01, name="q1")
      resources.initialize_resources(resources.shared_resources()).run()
    weight_placeholder = array_ops.placeholder(dtypes.float32)
    dense_placeholder = array_ops.placeholder(dtypes.float32)
    update = accumulator.add_summary(
        stamp_token=0,
        column=dense_placeholder,
        example_weights=weight_placeholder)
    with self.test_session() as sess:
      for i in range(100):
        dense_float = np.linspace(
            i * 100, (i + 1) * 100 - 1, num=100).reshape(-1, 1)
        sess.run(update, {
            dense_placeholder: dense_float,
            weight_placeholder: np.ones(shape=(100, 1), dtype=np.float32)
        })

    with self.test_session() as sess:
      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
      are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
      buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
      self.assertEqual(True, are_ready_flush)
      self.assertAllEqual([0, 3335., 6671., 9999.], buckets)
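A quick sanity check on the asserted boundaries (a sketch, not part of the test): with N = 10000 uniformly spaced values and epsilon = 0.01, each boundary may be off by up to epsilon * N = 100 ranks, and [0, 3335, 6671, 9999] is well within that tolerance of the exact tertiles:

import numpy as np

values = np.arange(10000, dtype=np.float64)
exact = np.percentile(values, [0, 100.0 / 3, 200.0 / 3, 100])
approx = np.array([0.0, 3335.0, 6671.0, 9999.0])
# Each approximate boundary is within epsilon * N = 100 ranks of exact.
assert np.all(np.abs(approx - exact) <= 0.01 * len(values))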
Example #3
  def _testStreamingQuantileBucketsHelper(
      self, inputs, num_quantiles=3, expected_buckets=None):
    """Helper to test quantile buckets on different inputs."""

    # Set generate_quantiles to True since the test will generate fewer
    # boundaries otherwise.
    with self.test_session() as sess:
      accumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=0, num_quantiles=num_quantiles,
          epsilon=0.001, name="q1", generate_quantiles=True)
      resources.initialize_resources(resources.shared_resources()).run()
    input_column = array_ops.placeholder(dtypes.float32)
    weights = array_ops.placeholder(dtypes.float32)
    update = accumulator.add_summary(
        stamp_token=0,
        column=input_column,
        example_weights=weights)

    with self.test_session() as sess:
      sess.run(update,
               {input_column: inputs,
                weights: [1] * len(inputs)})

    with self.test_session() as sess:
      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
      are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
      buckets, are_ready_flush = (sess.run(
          [buckets, are_ready_flush]))
      self.assertEqual(True, are_ready_flush)
      # By default, use 3 quantiles, which yields 4 boundaries, for simplicity.
      self.assertEqual(num_quantiles + 1, len(buckets))
      if expected_buckets:
        self.assertAllEqual(buckets, expected_buckets)
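A hypothetical invocation of this helper (the inputs here are illustrative and not from the original test suite):

# Hypothetical call: with generate_quantiles=True the helper should still
# return num_quantiles + 1 boundaries even for heavily repeated inputs.
self._testStreamingQuantileBucketsHelper(inputs=[5.0] * 10)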
Example #4
    def __init__(self, num_quantiles, epsilon, serialized_tf_config=None):
        self._num_quantiles = num_quantiles
        self._epsilon = epsilon
        self._serialized_tf_config = serialized_tf_config

        # _stamp_token is used to commit the state of the qaccumulator. In
        # this case, the qaccumulator state is completely returned and stored
        # as part of quantile_state/summary in the combiner fn (i.e. the
        # summary is extracted and stored outside the qaccumulator), so we
        # don't use the stamp-token mechanism to signify progress in the
        # qaccumulator state.
        self._stamp_token = 0
        # Represents an empty summary. This could be changed to a tf.constant
        # implemented by the quantile ops library.
        self._empty_summary = None

        # Create a new session with a new graph for quantile ops.
        self._session = tf.Session(
            graph=tf.Graph(),
            config=_maybe_deserialize_tf_config(serialized_tf_config))
        with self._session.graph.as_default():
            with self._session.as_default():
                self._qaccumulator = quantile_ops.QuantileAccumulator(
                    init_stamp_token=self._stamp_token,
                    num_quantiles=self._num_quantiles,
                    epsilon=self._epsilon,
                    name='qaccumulator')
                resources.initialize_resources(
                    resources.shared_resources()).run()
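Example #4 calls a _maybe_deserialize_tf_config helper that is not shown here. A plausible sketch, under the assumption that serialized_tf_config holds the bytes of a serialized tf.ConfigProto (this is an assumption, not the original helper):

def _maybe_deserialize_tf_config(serialized_tf_config):
    # Assumed behavior: no config provided means "use session defaults".
    if serialized_tf_config is None:
        return None
    # Otherwise parse the bytes back into a tf.ConfigProto for tf.Session.
    tf_config = tf.ConfigProto()
    tf_config.ParseFromString(serialized_tf_config)
    return tf_config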
Example #5
    def initialize_local_state(self, tf_config=None):
        """Called by the CombineFnWrapper's __init__ method.

    This can be used to set non-pickleable local state.  It is used in
    conjunction with overriding __reduce__ so this state is not pickled.  This
    method must be called prior to any other method.

    Args:
      tf_config: (optional) A tf.ConfigProto
    """
        # _stamp_token is used to commit the state of the qaccumulator. In
        # this case, the qaccumulator state is completely returned and stored
        # as part of quantile_state/summary in the combiner fn (i.e. the
        # summary is extracted and stored outside the qaccumulator), so we
        # don't use the stamp-token mechanism to signify progress in the
        # qaccumulator state.
        self._stamp_token = 0
        # Represents an empty summary. This could be changed to a tf.constant
        # implemented by the quantile ops library.
        self._empty_summary = None

        # Create a new session with a new graph for quantile ops.
        self._session = tf.Session(graph=tf.Graph(), config=tf_config)
        with self._session.graph.as_default():
            with self._session.as_default():
                self._qaccumulator = quantile_ops.QuantileAccumulator(
                    init_stamp_token=self._stamp_token,
                    num_quantiles=self._num_quantiles,
                    epsilon=self._epsilon,
                    name='qaccumulator')
                resources.initialize_resources(
                    resources.shared_resources()).run()
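The docstring mentions pairing this method with an overridden __reduce__ so the session and graph are never pickled. A minimal sketch of that pattern (hypothetical; the original override is not shown, and the constructor arguments are assumed):

def __reduce__(self):
    # Hypothetical sketch: pickle only the constructor arguments. The
    # unpickled copy must call initialize_local_state() again before use,
    # recreating the session, graph, and accumulator locally.
    return (self.__class__, (self._num_quantiles, self._epsilon))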
Example #6
  def __init__(self,
               l1_regularization,
               l2_regularization,
               tree_complexity_regularization,
               min_node_weight,
               feature_column_group_id,
               epsilon,
               num_quantiles,
               gradient_shape,
               hessian_shape,
               multiclass_strategy,
               init_stamp_token=0,
               loss_uses_sum_reduction=False,
               name=None):
    """Initialize the internal state for this split handler.

    Args:
      l1_regularization: L1 regularization applied for this split handler.
      l2_regularization: L2 regularization applied for this split handler.
      tree_complexity_regularization: Tree complexity regularization applied
          for this split handler.
      min_node_weight: Minimum sum of weights of examples in each partition to
          be considered for splitting.
      feature_column_group_id: Feature column group index.
      epsilon: A float, the error bound for quantile computation.
      num_quantiles: An int, the number of buckets to create from the histogram.
      gradient_shape: A TensorShape, containing shape of gradients.
      hessian_shape: A TensorShape, containing shape of hessians.
      multiclass_strategy: Strategy describing how to treat multiclass problems.
      init_stamp_token: A tensor containing a scalar, the initial stamp of the
         stamped objects.
      loss_uses_sum_reduction: A scalar boolean tensor that specifies whether
          SUM or MEAN reduction was used for the loss.
      name: An optional handler name.
    """
    super(InequalitySplitHandler, self).__init__(
        name=name,
        l1_regularization=l1_regularization,
        l2_regularization=l2_regularization,
        tree_complexity_regularization=tree_complexity_regularization,
        min_node_weight=min_node_weight,
        feature_column_group_id=feature_column_group_id,
        gradient_shape=gradient_shape,
        hessian_shape=hessian_shape,
        multiclass_strategy=multiclass_strategy,
        loss_uses_sum_reduction=loss_uses_sum_reduction)
    self._stats_accumulator = stats_accumulator_ops.StatsAccumulator(
        init_stamp_token,
        gradient_shape,
        hessian_shape,
        name="StatsAccumulator/{}".format(self._name))
    # Allocate both stats accumulator and quantile accumulator on the same
    # device so that we can build splits with fewer RPCs.
    with ops.colocate_with(self._stats_accumulator.resource()):
      self._quantile_accumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token,
          epsilon=epsilon,
          num_quantiles=num_quantiles,
          name="QuantileAccumulator/{}".format(self._name))
Example #7
    def testSaveRestoreAfterFlush(self):
        save_dir = os.path.join(self.get_temp_dir(), "save_restore")
        save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")

        with self.test_session(graph=ops.Graph()) as sess:
            accumulator = quantile_ops.QuantileAccumulator(init_stamp_token=0,
                                                           num_quantiles=3,
                                                           epsilon=0.33,
                                                           name="q0")

            save = saver.Saver()
            resources.initialize_resources(resources.shared_resources()).run()

            example_weights = constant_op.constant([10, 1, 1, 1, 1, 1],
                                                   dtype=dtypes.float32,
                                                   shape=[6, 1])
            dense_float_tensor_0 = constant_op.constant([1, 2, 3, 4, 4, 5],
                                                        dtype=dtypes.float32,
                                                        shape=[6, 1])
            update = accumulator.add_summary(stamp_token=0,
                                             column=dense_float_tensor_0,
                                             example_weights=example_weights)
            update.run()
            reset = accumulator.flush(stamp_token=0, next_stamp_token=1)
            with ops.control_dependencies([reset]):
                are_ready_flush, buckets = (accumulator.get_buckets(
                    stamp_token=1))
            buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
            self.assertEqual(True, are_ready_flush)
            self.assertAllEqual([1, 3, 5], buckets)
            save.save(sess, save_path)

        with self.test_session(graph=ops.Graph()) as sess:
            accumulator = quantile_ops.QuantileAccumulator(init_stamp_token=0,
                                                           num_quantiles=3,
                                                           epsilon=0.33,
                                                           name="q0")
            save = saver.Saver()

            # Restore the saved values in the parameter nodes.
            save.restore(sess, save_path)
            are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
            buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
            self.assertEqual(True, are_ready_flush)
            self.assertAllEqual([1, 3, 5], buckets)
Example #8
    def testStreamingQuantileBucketsLowPrecisionInput(self):
        """Tests inputs that simulate low precision float16 values."""

        num_quantiles = 3
        # Set generate_quantiles to True since the test will generate fewer
        # boundaries otherwise.
        with self.test_session() as sess:
            accumulator = quantile_ops.QuantileAccumulator(
                init_stamp_token=0,
                num_quantiles=num_quantiles,
                epsilon=0.001,
                name="q1",
                generate_quantiles=True)
            resources.initialize_resources(resources.shared_resources()).run()
        input_column = array_ops.placeholder(dtypes.float32)
        weights = array_ops.placeholder(dtypes.float32)
        update = accumulator.add_summary(stamp_token=0,
                                         column=input_column,
                                         example_weights=weights)

        with self.test_session() as sess:
            # This input is generated from integers in the range [2030, 2060]
            # but represented with float16 precision. Integers <= 2048 are
            # exactly representable, whereas numbers > 2048 are rounded (and
            # hence some values are repeated). For precision loss / rounding,
            # see:
            # https://en.wikipedia.org/wiki/Half-precision_floating-point_format.
            #
            # The intent of the test is not to verify handling of float16
            # values, but to validate that the expected number of buckets is
            # returned when the input contains repeated values.
            inputs = [
                2030.0, 2031.0, 2032.0, 2033.0, 2034.0, 2035.0, 2036.0, 2037.0,
                2038.0, 2039.0, 2040.0, 2041.0, 2042.0, 2043.0, 2044.0, 2045.0,
                2046.0, 2047.0, 2048.0, 2048.0, 2050.0, 2052.0, 2052.0, 2052.0,
                2054.0, 2056.0, 2056.0, 2056.0, 2058.0, 2060.0
            ]
            sess.run(update, {
                input_column: inputs,
                weights: [1] * len(inputs)
            })

        with self.test_session() as sess:
            sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
            are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
            buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
            self.assertEqual(True, are_ready_flush)
            self.assertEqual(num_quantiles + 1, len(buckets))
            self.assertAllEqual([2030, 2040, 2050, 2060], buckets)
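The rounding behavior the comment describes can be checked directly with numpy (a sketch, not part of the test): float16 represents all integers up to 2048 exactly, while between 2048 and 4096 the representable values are 2 apart, so odd integers round to an even neighbor.

import numpy as np

assert np.float16(2048) == 2048  # exactly representable
assert np.float16(2049) == 2048  # ties round to the even significand
assert np.float16(2051) == 2052  # hence 2051, 2052, 2053 all map to 2052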
Example #9
    def testStreamingQuantileBucketsWithVaryingBatch(self):
        """Sets up the quantile summary op test as follows.

    Creates batches examples with different number of inputs in each batch.
    The input values are dense in the range [1 ... N]
    The data looks like this:
    | Batch | Start | InputList
    |   1   |   1   |  [1]
    |   2   |   2   |  [2, 3]
    |   3   |   4   |  [4, 5, 6]
    |   4   |   7   |  [7, 8, 9, 10]
    |   5   |  11   |  [11, 12, 13, 14, 15]
    |   6   |  16   |  [16, 17, 18, 19, 20, 21]
    """

        num_quantiles = 3
        with self.test_session() as sess:
            accumulator = quantile_ops.QuantileAccumulator(
                init_stamp_token=0,
                num_quantiles=num_quantiles,
                epsilon=0.001,
                name="q1")
            resources.initialize_resources(resources.shared_resources()).run()
        input_column = array_ops.placeholder(dtypes.float32)
        weights = array_ops.placeholder(dtypes.float32)
        update = accumulator.add_summary(stamp_token=0,
                                         column=input_column,
                                         example_weights=weights)

        with self.test_session() as sess:
            for i in range(1, 23):
                # start = 1, 2, 4, 7, 11, 16 ... (see comment above)
                start = int((i * (i - 1) / 2) + 1)
                sess.run(update, {
                    input_column: range(start, start + i),
                    weights: [1] * i
                })

        with self.test_session() as sess:
            sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
            are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
            buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
            self.assertEqual(True, are_ready_flush)
            self.assertEqual(num_quantiles + 1, len(buckets))
            self.assertAllEqual([1, 86., 170., 253.], buckets)
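A quick check of the batch-start formula used in the loop (a sketch): start(i) = i(i-1)/2 + 1 reproduces the docstring table, and after 22 batches the largest value seen is 253, matching the top boundary asserted above.

starts = [int(i * (i - 1) / 2) + 1 for i in range(1, 7)]
assert starts == [1, 2, 4, 7, 11, 16]  # matches the docstring table
# Batch i = 22 starts at 232 and contains 22 values, ending at
# 232 + 22 - 1 = 253 = 22 * 23 / 2, the top bucket boundary.
assert int(22 * 21 / 2) + 1 + 22 - 1 == 253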
Example #10
    def testStreamingQuantileBuckets(self):
        """Sets up the quantile summary op test as follows.

    Create a batch of 6 examples having a dense and sparse features.
    The data looks like this
    | Instance | instance weights | Dense 0
    | 0        |     10           |   1
    | 1        |     1            |   2
    | 2        |     1            |   3
    | 3        |     1            |   4
    | 4        |     1            |   4
    | 5        |     1            |   5
    """
        dense_float_tensor_0 = np.array([1, 2, 3, 4, 4, 5])
        example_weights = np.array([10, 1, 1, 1, 1, 1])

        with self.test_session() as sess:
            accumulator = quantile_ops.QuantileAccumulator(init_stamp_token=0,
                                                           num_quantiles=3,
                                                           epsilon=0.33,
                                                           name="q1")

            resources.initialize_resources(resources.shared_resources()).run()

            are_ready_noflush, _ = accumulator.get_buckets(stamp_token=0)

            update = accumulator.add_summary(stamp_token=0,
                                             column=dense_float_tensor_0,
                                             example_weights=example_weights)
            with ops.control_dependencies([are_ready_noflush, update]):
                reset = accumulator.flush(stamp_token=0, next_stamp_token=1)
            with ops.control_dependencies([reset]):
                are_ready_flush, buckets = (accumulator.get_buckets(
                    stamp_token=1))
            buckets, are_ready_noflush, are_ready_flush = (sess.run(
                [buckets, are_ready_noflush, are_ready_flush]))
            self.assertEqual(False, are_ready_noflush)
            self.assertEqual(True, are_ready_flush)
            self.assertAllEqual([1, 3, 5], buckets)
Example #11
  def initialize_local_state(self, tf_config=None):
    """Called by the CombineFnWrapper's __init__ method.

    This can be used to set non-pickleable local state.  It is used in
    conjunction with overriding __reduce__ so this state is not pickled.  This
    method must be called prior to any other method.

    Args:
      tf_config: (optional) A tf.ConfigProto
    """
    # stamp_token is used to commit the state of the qaccumulator. In
    # this case, the qaccumulator state is completely returned and stored
    # as part of quantile_state/summary in the combiner fn (i.e. the
    # summary is extracted and stored outside the qaccumulator), so we
    # don't use the stamp-token mechanism to signify progress in the
    # qaccumulator state.
    stamp_token = 0

    # Create a new session with a new graph for quantile ops.
    self._session = tf.Session(graph=tf.Graph(), config=tf_config)
    with self._session.graph.as_default():
      with self._session.as_default():
        self._qaccumulator = quantile_ops.QuantileAccumulator(
            init_stamp_token=stamp_token,
            num_quantiles=self._num_quantiles,
            epsilon=self._epsilon,
            name='qaccumulator')
        resources.initialize_resources(resources.shared_resources()).run()

        # Create a placeholder that will be used to provide input to the
        # QuantileAccumulator. It has shape (1, None), as this is what the
        # QuantileAccumulator accepts.
        self._add_summary_input = tf.placeholder(
            dtype=self._bucket_numpy_dtype, shape=[1, None])

        # Create op to update the accumulator with new input fed through
        # self._add_summary_input.
        self._add_summary_op = self._qaccumulator.add_summary(
            stamp_token=stamp_token,
            column=self._add_summary_input,
            # All weights are equal, and the weight vector is the
            # same length as the input.
            example_weights=tf.ones_like(self._add_summary_input))

        # Create op to add a prebuilt summary to the accumulator, and a
        # placeholder tensor to provide the input for this op.
        self._prebuilt_summary_input = tf.placeholder(
            dtype=tf.string, shape=[])
        self._add_prebuilt_summary_op = self._qaccumulator.add_prebuilt_summary(
            stamp_token=stamp_token,
            summary=self._prebuilt_summary_input)

        # Create op to flush summaries and return a summary representing the
        # summaries that were added to the accumulator so far.
        self._flush_summary_op = self._qaccumulator.flush_summary(
            stamp_token=stamp_token,
            next_stamp_token=stamp_token)

        # Create ops to flush the accumulator and return approximate boundaries.
        self._flush_op = self._qaccumulator.flush(
            stamp_token=stamp_token,
            next_stamp_token=stamp_token)
        _, self._buckets_op = self._qaccumulator.get_buckets(
            stamp_token=stamp_token)

    # We generate an empty summary by calling self._flush_summary_op. We
    # cache it because some implementations may call create_accumulator for
    # every input; caching is safe since the empty summary is always the
    # same and immutable.
    self._empty_summary = self._session.run(self._flush_summary_op)
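The ops built above are presumably driven by the combiner's other methods; a hypothetical sketch of feeding them (the call sites below are illustrative, not from the original class):

# Hypothetical usage of the ops defined above: add raw values, then get
# a summary of everything added so far.
self._session.run(
    self._add_summary_op,
    feed_dict={self._add_summary_input: [[1.0, 2.0, 3.0]]})
summary = self._session.run(self._flush_summary_op)

# Merge a previously produced summary back in, then flush and read the
# approximate bucket boundaries.
self._session.run(
    self._add_prebuilt_summary_op,
    feed_dict={self._prebuilt_summary_input: summary})
self._session.run(self._flush_op)
buckets = self._session.run(self._buckets_op)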