Example #1
0
def bucketize(x, num_buckets, epsilon=None, weights=None, name=None):
    """Returns a bucketized column, with a bucket index assigned to each input.

    Args:
      x: A numeric input `Tensor` or `SparseTensor` whose values should be
        mapped to buckets.  For a `SparseTensor` only non-missing values will
        be included in the quantiles computation, and the result of
        `bucketize` will be a `SparseTensor` with non-missing values mapped
        to buckets.
      num_buckets: Values in the input `x` are divided into approximately
        equal-sized buckets, where the number of buckets is `num_buckets`.
        This is a hint.  The actual number of buckets computed can be less or
        more than the requested number.  Use the generated metadata to find
        the computed number of buckets.
      epsilon: (Optional) Error tolerance, typically a small fraction close
        to zero.  If a value is not specified by the caller, a suitable value
        is computed based on experimental results.  For `num_buckets` less
        than 100, the value of 0.01 is chosen to handle a dataset of up to
        ~1 trillion input data values.  If `num_buckets` is larger, then
        epsilon is set to (1/`num_buckets`) to enforce a stricter error
        tolerance, because more buckets will result in a smaller range for
        each bucket, and so we want the boundaries to be less fuzzy.  See
        analyzers.quantiles() for details.
      weights: (Optional) Weights tensor for the quantiles.  Tensor must have
        the same shape as `x`.
      name: (Optional) A name for this operation.

    Returns:
      A `Tensor` of the same shape as `x`, with each element in the returned
      tensor representing the bucketized value.  Bucketized value is in the
      range [0, actual_num_buckets).  Sometimes the actual number of buckets
      can be different than the `num_buckets` hint, for example in case the
      number of distinct values is smaller than `num_buckets`, or in cases
      where the input values are not uniformly distributed.

    Raises:
      TypeError: If `num_buckets` is not an int.
      ValueError: If `num_buckets` is less than 1.
    """
    with tf.name_scope(name, 'bucketize'):
        if not isinstance(num_buckets, int):
            raise TypeError('num_buckets must be an int, got %s' %
                            type(num_buckets))

        if num_buckets < 1:
            raise ValueError('Invalid num_buckets %d' % num_buckets)

        if epsilon is None:
            # See the explanation for `epsilon` in the docstring: min() picks
            # 0.01 for small num_buckets and 1/num_buckets for larger ones.
            epsilon = min(1.0 / num_buckets, 0.01)

        # For a SparseTensor, only the non-missing values participate in the
        # quantiles computation; the sparsity pattern is preserved by
        # apply_buckets.
        x_values = x.values if isinstance(x, tf.SparseTensor) else x
        bucket_boundaries = analyzers.quantiles(x_values, num_buckets, epsilon,
                                                weights)
        return apply_buckets(x, bucket_boundaries)
Example #2
0
def bucketize(x, num_buckets, epsilon=None, name=None):
    """Returns a bucketized column, with a bucket index assigned to each input.

    Args:
      x: A numeric input `Tensor` whose values should be mapped to buckets.
      num_buckets: Values in the input `x` are divided into approximately
        equal-sized buckets, where the number of buckets is `num_buckets`.
      epsilon: (Optional) Error tolerance, typically a small fraction close
        to zero.  If a value is not specified by the caller, a suitable value
        is computed based on experimental results.  For `num_buckets` less
        than 100, the value of 0.01 is chosen to handle a dataset of up to
        ~1 trillion input data values.  If `num_buckets` is larger, then
        epsilon is set to (1/`num_buckets`) to enforce a stricter error
        tolerance, because more buckets will result in a smaller range for
        each bucket, and so we want the boundaries to be less fuzzy.  See
        analyzers.quantiles() for details.
      name: (Optional) A name for this operation.

    Returns:
      A `Tensor` of the same shape as `x`, with each element in the returned
      tensor representing the bucketized value.  Bucketized value is in the
      range [0, num_buckets).

    Raises:
      TypeError: If `num_buckets` is not an int.
      ValueError: If `num_buckets` is less than 1.
    """
    with tf.name_scope(name, 'bucketize'):
        if not isinstance(num_buckets, int):
            # Format with `%`, not a comma: passing the value as a second
            # exception argument would leave the message template
            # unformatted.
            raise TypeError('num_buckets must be an int, got %s' %
                            type(num_buckets))

        if num_buckets < 1:
            raise ValueError('Invalid num_buckets %d' % num_buckets)

        if epsilon is None:
            # See the explanation for `epsilon` in the docstring: min() picks
            # 0.01 for small num_buckets and 1/num_buckets for larger ones.
            epsilon = min(1.0 / num_buckets, 0.01)

        bucket_boundaries = analyzers.quantiles(x, num_buckets, epsilon)
        buckets = quantile_ops.bucketize_with_input_boundaries(
            x, boundaries=bucket_boundaries, name='assign_buckets')

        # Convert to int64 because int32 is not compatible with tf.Example
        # parser.  See _TF_EXAMPLE_ALLOWED_TYPES in
        # FixedColumnRepresentation() in tf_metadata/dataset_schema.py
        return tf.to_int64(buckets)