Example #1
def apply_buckets(x, bucket_boundaries, name=None):
  """Returns a bucketized column, with a bucket index assigned to each input.

  Args:
    x: A numeric input `Tensor` whose values should be mapped to buckets.
    bucket_boundaries: The bucket boundaries, represented as a rank-2 `Tensor`.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` of the same shape as `x`, with each element in the
    returned tensor representing the bucketized value. The bucketized value
    is in the range [0, num_boundaries], where num_boundaries is the number
    of boundaries in `bucket_boundaries`.
  """
  with tf.name_scope(name, 'apply_buckets'):
    buckets = quantile_ops.bucketize_with_input_boundaries(
        x, boundaries=bucket_boundaries, name='assign_buckets')
    # Convert to int64 because int32 is not compatible with tf.Example parser.
    # See _TF_EXAMPLE_ALLOWED_TYPES in FixedColumnRepresentation()
    # in tf_metadata/dataset_schema.py
    result = tf.to_int64(buckets)

    # Attach the relevant metadata to result, so that the corresponding
    # output feature will have this metadata set.
    min_value = tf.constant(0, tf.int64)
    max_value = tf.shape(bucket_boundaries)[1]
    api.set_tensor_schema_overrides(result, min_value, max_value)

    return result
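The bucket-assignment rule used by `bucketize_with_input_boundaries` can be
mirrored in pure Python. A minimal reference sketch (an assumption checked
against the tests below, not part of the library): a value equal to a
boundary is assigned to the upper bucket, so outputs span [0, len(boundaries)].

import bisect

def bucketize_reference(values, boundaries):
  # Values below the first boundary map to bucket 0; values greater than
  # or equal to the last boundary map to bucket len(boundaries).
  return [bisect.bisect_right(boundaries, v) for v in values]

assert bucketize_reference([1, 2, 3, 4, 5], [3]) == [0, 0, 1, 1, 1]
assert bucketize_reference([1, 2, 3, 4, 5], [2, 4]) == [0, 1, 1, 2, 2]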
Example #2
 def testBucketizeWithInputBoundaries2(self):
   with self.test_session():
     boundaries = constant_op.constant([3], dtype=dtypes.float32)
     buckets = quantile_ops.bucketize_with_input_boundaries(
         input=[1, 2, 3, 4, 5],
         boundaries=boundaries)
     self.assertAllEqual([0, 0, 1, 1, 1], buckets.eval())
Example #3
 def testBucketizeWithInputBoundaries3(self):
   with self.test_session():
     b = array_ops.placeholder(dtypes.float32)
     buckets = quantile_ops.bucketize_with_input_boundaries(
         input=[1, 2, 3, 4, 5],
         boundaries=b)
     self.assertAllEqual([0, 1, 1, 2, 2],
                         buckets.eval(feed_dict={b: [2, 4]}))
Example #5
def _apply_buckets_with_keys(x, key, key_vocab, bucket_boundaries, name=None):
  """Bucketize a Tensor or SparseTensor where boundaries depend on the index.

  Args:
    x: A 1-d Tensor or SparseTensor.
    key: A 1-d Tensor or SparseTensor with the same size as x.
    key_vocab: A vocab containing all keys.  Must be exhaustive; an
        out-of-vocab entry in `key` will cause a crash.
    bucket_boundaries: A rank-2 Tensor of shape (key_size, num_buckets).
    name: (Optional) A name for this operation.

  Returns:
    A tensor with the same shape as `x` and dtype tf.int64.
  """
  with tf.name_scope(name, 'apply_buckets_with_keys'):
    x_values = x.values if isinstance(x, tf.SparseTensor) else x
    key_values = key.values if isinstance(key, tf.SparseTensor) else key

    x_values = tf.to_float(x_values)
    # Convert `key_values` to indices in key_vocab.  We must use apply_function
    # since this uses a Table.
    key_indices = _lookup_key(key_values, key_vocab)

    combined_boundaries, offsets = _combine_bucket_boundaries(bucket_boundaries)

    # Apply the per-key offsets to x, which produces offset buckets (where the
    # bucket offset is an integer offset).  Then remove this offset to get the
    # actual per-key buckets for x.
    offset_x = x_values + tf.gather(offsets, key_indices)
    offset_buckets = tf.to_int64(
        quantile_ops.bucketize_with_input_boundaries(
            offset_x, combined_boundaries))
    num_buckets = tf.to_int64(tf.shape(bucket_boundaries)[1])
    bucketized_values = tf.clip_by_value(
        offset_buckets - key_indices * num_buckets, 0, num_buckets)

    # Attach the relevant metadata to result, so that the corresponding
    # output feature will have this metadata set.
    min_value = tf.constant(0, tf.int64)
    max_value = num_buckets
    schema_inference.set_tensor_schema_override(bucketized_values, min_value,
                                                max_value)

    if isinstance(x, tf.SparseTensor):
      result = tf.SparseTensor(x.indices, bucketized_values, x.dense_shape)
    else:
      result = bucketized_values

    return result
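The offset arithmetic above is easier to see with concrete numbers. A minimal
NumPy sketch of the same trick; the per-key offsets and boundaries here are
illustrative, and `_combine_bucket_boundaries` is assumed to shift each key's
row so that the flattened boundary list stays increasing:

import numpy as np

# Per-key boundaries: key 0 buckets on [10, 20], key 1 on [100, 200].
bucket_boundaries = np.array([[10., 20.], [100., 200.]])
num_buckets = bucket_boundaries.shape[1]

offsets = np.array([0., 1000.])  # illustrative per-key offsets
combined = (bucket_boundaries + offsets[:, None]).ravel()  # [10 20 1100 1200]

x = np.array([5., 15., 150.])
key_indices = np.array([0, 0, 1])

# One global bucketize over the combined boundaries, then subtract
# key_index * num_buckets to recover each value's per-key bucket.
offset_buckets = np.searchsorted(combined, x + offsets[key_indices],
                                 side='right')
buckets = np.clip(offset_buckets - key_indices * num_buckets, 0, num_buckets)
print(buckets)  # [0 1 1]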
Example #6
def bucketize(x, num_buckets, epsilon=None, name=None):
  """Returns a bucketized column, with a bucket index assigned to each input.

  Args:
    x: A numeric input `Tensor` whose values should be mapped to buckets.
    num_buckets: Values in the input `x` are divided into approximately
      equal-sized buckets, where the number of buckets is num_buckets.
    epsilon: (Optional) Error tolerance, typically a small fraction close to
      zero. If a value is not specified by the caller, a suitable value is
      computed based on experimental results.  For `num_buckets` less than 100,
      the value of 0.01 is chosen to handle a dataset of up to ~1 trillion
      input data values.  If `num_buckets` is larger, then epsilon is set to
      (1/`num_buckets`) to enforce a stricter error tolerance, because more
      buckets will result in a smaller range for each bucket, and so we want
      the boundaries to be less fuzzy.
      See analyzers.quantiles() for details.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` of the same shape as `x`, with each element in the
    returned tensor representing the bucketized value. Bucketized value is
    in the range [0, num_buckets).

  Raises:
    TypeError: If num_buckets is not an int.
    ValueError: If num_buckets is less than 1.
  """
  with tf.name_scope(name, 'bucketize'):
    if not isinstance(num_buckets, int):
      raise TypeError('num_buckets must be an int, got %s' % type(num_buckets))

    if num_buckets < 1:
      raise ValueError('Invalid num_buckets %d' % num_buckets)

    if epsilon is None:
      # See explanation in args documentation for epsilon.
      epsilon = min(1.0 / num_buckets, 0.01)

    bucket_boundaries = analyzers.quantiles(x, num_buckets, epsilon)
    buckets = quantile_ops.bucketize_with_input_boundaries(
        x, boundaries=bucket_boundaries, name='assign_buckets')

    # Convert to int64 because int32 is not compatible with tf.Example parser.
    # See _TF_EXAMPLE_ALLOWED_TYPES in FixedColumnRepresentation()
    # in tf_metadata/dataset_schema.py
    return tf.to_int64(buckets)
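For context, a minimal sketch of how `bucketize` is typically invoked from a
tf.Transform preprocessing function (the feature name is illustrative):

import tensorflow_transform as tft

def preprocessing_fn(inputs):
  # Assign each 'age' value to one of 10 approximately equal-sized
  # quantile buckets; epsilon defaults to min(1.0 / 10, 0.01) = 0.01.
  return {'age_bucketized': tft.bucketize(inputs['age'], num_buckets=10)}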
Example #7
def apply_buckets(x, bucket_boundaries, name=None):
  """Returns a bucketized column, with a bucket index assigned to each input.

  Args:
    x: A numeric input `Tensor` or `SparseTensor` whose values should be mapped
        to buckets.  For `SparseTensor`s, the non-missing values will be mapped
        to buckets and missing value left missing.
    bucket_boundaries: The bucket boundaries represented as a rank 2 `Tensor`.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` of the same shape as `x`, with each element in the
    returned tensor representing the bucketized value. The bucketized value
    is in the range [0, num_boundaries], where num_boundaries is the number
    of boundaries in `bucket_boundaries`.
  """
  tf.compat.v1.assert_rank(bucket_boundaries, 2)
  with tf.compat.v1.name_scope(name, 'apply_buckets'):
    x_values = x.values if isinstance(x, tf.SparseTensor) else x
    buckets = quantile_ops.bucketize_with_input_boundaries(
        x_values, boundaries=bucket_boundaries, name='assign_buckets')
    # Convert to int64 because int32 is not compatible with tf.Example parser.
    # See _TF_EXAMPLE_ALLOWED_TYPES in FixedColumnRepresentation()
    # in tf_metadata/dataset_schema.py
    bucketized_values = tf.cast(buckets, dtype=tf.int64)

    # Attach the relevant metadata to result, so that the corresponding
    # output feature will have this metadata set.
    min_value = tf.constant(0, tf.int64)
    max_value = tf.shape(input=bucket_boundaries)[1]
    schema_inference.set_tensor_schema_override(
        bucketized_values, min_value, max_value)

    if isinstance(x, tf.SparseTensor):
      result = tf.SparseTensor(x.indices, bucketized_values, x.dense_shape)
    else:
      result = bucketized_values

    return result
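A minimal sketch of the `SparseTensor` path, assuming a TF 1.x graph session
(values and boundaries are illustrative): only `values` are bucketized, while
`indices` and `dense_shape` pass through unchanged.

sp = tf.SparseTensor(indices=[[0, 0], [2, 1]], values=[1.5, 7.0],
                     dense_shape=[3, 2])
bucketized = apply_buckets(sp, bucket_boundaries=[[2.0, 5.0]])
# bucketized.values evaluates to [0, 2]; positions missing in `sp`
# remain missing in the result.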
Example #8
 def testBucketizeWithInputBoundaries(self):
   with self.test_session():
     buckets = quantile_ops.bucketize_with_input_boundaries(
         input=[1, 2, 3, 4, 5],
         boundaries=[3])
     self.assertAllEqual([0, 0, 1, 1, 1], buckets.eval())
Example #9
def apply_buckets_with_interpolation(x, bucket_boundaries, name=None):
  """Interpolates within the provided buckets and then normalizes to 0 to 1.

  A method for normalizing continuous numeric data to the range [0, 1].
  Numeric values are first bucketized according to the provided boundaries, then
  linearly interpolated within their respective bucket ranges. Finally, the
  interpolated values are normalized to the range [0, 1]. Values that are
  less than or equal to the lowest boundary, or greater than or equal to the
  highest boundary, will be mapped to 0 and 1 respectively.

  This is a non-linear approach to normalization that is less sensitive to
  outliers than min-max or z-score scaling. When outliers are present, standard
  forms of normalization can leave the majority of the data compressed into a
  very small segment of the output range, whereas this approach tends to spread
  out the more frequent values (if quantile buckets are used). Note that
  distance relationships in the raw data are not necessarily preserved (data
  points that are close to each other in the raw feature space may not be
  equally close in the transformed feature space). This means that unlike
  linear normalization methods, correlations between features may be
  distorted by the transformation.

  Args:
    x: A numeric input `Tensor`/`SparseTensor` (tf.float[32|64], tf.int[32|64])
    bucket_boundaries: Sorted bucket boundaries as a rank-2 `Tensor`.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` or `SparseTensor` of the same shape as `x`, normalized to the
      range [0, 1]. If the input x is tf.float64, the returned values will be
      tf.float64. Otherwise, returned values are tf.float32.

  """
  with tf.name_scope(name, 'buckets_with_interpolation'):
    tf.assert_rank(bucket_boundaries, 2)
    x_values = x
    compose_result_fn = lambda values: values
    if isinstance(x, tf.SparseTensor):
      x_values = x.values
      compose_result_fn = (lambda values: tf.SparseTensor(  # pylint: disable=g-long-lambda
          indices=x.indices, values=values, dense_shape=x.dense_shape))
    if not check_ops.is_numeric_tensor(x_values):
      # InvalidArgumentError requires node_def and op arguments; pass None
      # for both when raising it directly from Python.
      raise tf.errors.InvalidArgumentError(
          None, None, 'Input tensor to be normalized must be numeric.')
    return_type = tf.float64 if x.dtype == tf.float64 else tf.float32
    num_boundaries = tf.to_int64(tf.shape(bucket_boundaries)[1])

    # The TF BucketizeWithInputBoundaries Op expects boundaries as tf.float32.
    bucket_boundaries = tf.cast(bucket_boundaries, tf.float32)
    bucket_indices = tf.cast(
        quantile_ops.bucketize_with_input_boundaries(
            x_values, boundaries=bucket_boundaries, name='assign_buckets'),
        tf.int64)

    # Get max, min, and width of the corresponding bucket for each element.
    bucket_max = tf.dtypes.cast(
        tf.gather(
            tf.concat([bucket_boundaries[0], bucket_boundaries[:, -1]], axis=0),
            bucket_indices), return_type)
    bucket_min = tf.dtypes.cast(
        tf.gather(
            tf.concat([bucket_boundaries[:, 0], bucket_boundaries[0]], axis=0),
            bucket_indices), return_type)
    bucket_width = bucket_max - bucket_min
    zeros = tf.zeros_like(x_values, dtype=return_type)
    ones = tf.ones_like(x_values, dtype=return_type)

    # Linearly interpolate each value within its respective bucket range.
    interpolation_value = (
        (tf.dtypes.cast(x_values, return_type) - bucket_min) / bucket_width)
    bucket_interpolation = tf.verify_tensor_all_finite(
        tf.where(
            # If bucket index is first or last, which represents "less than
            # min" and "greater than max" respectively, the bucket logically
            # has an infinite width and we can't meaningfully interpolate.
            tf.logical_or(
                tf.equal(bucket_indices, 0),
                tf.equal(bucket_indices, num_boundaries)),
            zeros,
            tf.where(
                # If the bucket width is zero due to numerical imprecision,
                # there is no point in interpolating
                tf.equal(bucket_width, 0.0),
                ones / 2.0,
                # Finally, for a bucket with a valid width, we can interpolate.
                interpolation_value)),
        'bucket_interpolation')
    bucket_indices_with_interpolation = tf.dtypes.cast(
        tf.maximum(bucket_indices - 1, 0), return_type) + bucket_interpolation

    # Normalize the interpolated values to the range [0, 1].
    denominator = tf.dtypes.cast(tf.maximum(num_boundaries - 1, 1), return_type)
    normalized_values = tf.div(bucket_indices_with_interpolation, denominator)
    # If there is only one boundary, all values < the boundary are 0, all values
    # >= the boundary are 1.
    single_boundary_values = lambda: tf.where(  # pylint: disable=g-long-lambda
        tf.equal(bucket_indices, 0), zeros, ones)
    normalized_result = tf.cond(
        tf.equal(num_boundaries, 1),
        single_boundary_values, lambda: normalized_values)
    return compose_result_fn(normalized_result)
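A worked instance of the interpolation arithmetic above, in plain Python with
illustrative boundaries [0, 10, 20] and input 15:

boundaries = [0.0, 10.0, 20.0]
x = 15.0

bucket_index = 2  # bucketize: 10 <= 15 < 20, i.e. above two boundaries
bucket_min, bucket_max = 10.0, 20.0
interpolation = (x - bucket_min) / (bucket_max - bucket_min)  # 0.5

# Shift to a 0-based interior bucket, add the within-bucket fraction,
# then divide by the number of boundary intervals (num_boundaries - 1).
normalized = (max(bucket_index - 1, 0) + interpolation) / (len(boundaries) - 1)
print(normalized)  # 0.75: 15 sits three quarters of the way from 0 to 20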
Example #10
def apply_buckets_with_interpolation(x, bucket_boundaries, name=None):
  """Interpolates within the provided buckets and then normalizes to 0 to 1.

  A method for normalizing continuous numeric data to the range [0, 1].
  Numeric values are first bucketized according to the provided boundaries, then
  linearly interpolated within their respective bucket ranges. Finally, the
  interpolated values are normalized to the range [0, 1]. Values that are
  less than or equal to the lowest boundary, or greater than or equal to the
  highest boundary, will be mapped to 0 and 1 respectively.

  Args:
    x: A numeric input `Tensor` (tf.float32, tf.float64, tf.int32, tf.int64).
    bucket_boundaries: Sorted bucket boundaries as a rank-2 `Tensor`.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` of the same shape as `x`, normalized to the range [0, 1]. If the
      input x is tf.float64, the returned values will be tf.float64.
      Otherwise, returned values are tf.float32.

  """
  with tf.name_scope(name, 'buckets_with_interpolation'):
    tf.assert_rank(bucket_boundaries, 2)
    if not check_ops.is_numeric_tensor(x):
      # InvalidArgumentError requires node_def and op arguments; pass None
      # for both when raising it directly from Python.
      raise tf.errors.InvalidArgumentError(
          None, None, 'Input tensor to be normalized must be numeric.')
    return_type = tf.float64 if x.dtype == tf.float64 else tf.float32
    num_boundaries = tf.to_int64(tf.shape(bucket_boundaries)[1])

    # The TF BucketizeWithInputBoundaries Op expects boundaries as tf.float32.
    bucket_boundaries = tf.cast(bucket_boundaries, tf.float32)
    bucket_indices = tf.cast(
        quantile_ops.bucketize_with_input_boundaries(
            x, boundaries=bucket_boundaries, name='assign_buckets'), tf.int64)

    # Get max, min, and width of the corresponding bucket for each element.
    bucket_max = tf.dtypes.cast(
        tf.gather(
            tf.concat([bucket_boundaries[0], bucket_boundaries[:, -1]], axis=0),
            bucket_indices), return_type)
    bucket_min = tf.dtypes.cast(
        tf.gather(
            tf.concat([bucket_boundaries[:, 0], bucket_boundaries[0]], axis=0),
            bucket_indices), return_type)
    bucket_width = bucket_max - bucket_min
    zeros = tf.zeros_like(x, dtype=return_type)
    ones = tf.ones_like(x, dtype=return_type)

    # Linearly interpolate each value within its respective bucket range.
    interpolation_value = ((tf.dtypes.cast(x, return_type) - bucket_min) /
                           bucket_width)
    bucket_interpolation = tf.verify_tensor_all_finite(
        tf.where(
            # If bucket index is first or last, which represents "less than
            # min" and "greater than max" respectively, the bucket logically
            # has an infinite width and we can't meaningfully interpolate.
            tf.logical_or(
                tf.equal(bucket_indices, 0),
                tf.equal(bucket_indices, num_boundaries)),
            zeros,
            tf.where(
                # If the bucket width is zero due to numerical imprecision,
                # there is no point in interpolating
                tf.equal(bucket_width, 0.0),
                ones / 2.0,
                # Finally, for a bucket with a valid width, we can interpolate.
                interpolation_value)),
        'bucket_interpolation')
    bucket_indices_with_interpolation = tf.dtypes.cast(
        tf.maximum(bucket_indices - 1, 0), return_type) + bucket_interpolation

    # Normalize the interpolated values to the range [0, 1].
    denominator = tf.dtypes.cast(tf.maximum(num_boundaries - 1, 1), return_type)
    normalized_values = tf.div(bucket_indices_with_interpolation, denominator)
    # If there is only one boundary, all values < the boundary are 0, all values
    # >= the boundary are 1.
    single_boundary_values = lambda: tf.where(  # pylint: disable=g-long-lambda
        tf.equal(bucket_indices, 0), zeros, ones)
    return tf.cond(
        tf.equal(num_boundaries, 1),
        single_boundary_values, lambda: normalized_values)
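A hedged usage sketch pairing this function with quantile boundaries from
`analyzers.quantiles`, as in `bucketize` above (the feature name is
illustrative):

def preprocessing_fn(inputs):
  # Normalize 'income' to [0, 1] using outlier-robust quantile buckets.
  boundaries = analyzers.quantiles(inputs['income'], num_buckets=10,
                                   epsilon=0.01)
  return {
      'income_normalized': apply_buckets_with_interpolation(
          inputs['income'], boundaries),
  }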