def bucketize(x, num_buckets, epsilon=None, weights=None, name=None):
  """Returns a bucketized column, with a bucket index assigned to each input.

  Args:
    x: A numeric input `Tensor` or `SparseTensor` whose values should be
      mapped to buckets.  For a `SparseTensor` only non-missing values will
      be included in the quantiles computation, and the result of `bucketize`
      will be a `SparseTensor` with non-missing values mapped to buckets.
    num_buckets: Values in the input `x` are divided into approximately
      equal-sized buckets, where the number of buckets is num_buckets.
      This is a hint. The actual number of buckets computed can be
      less or more than the requested number. Use the generated metadata to
      find the computed number of buckets.
    epsilon: (Optional) Error tolerance, typically a small fraction close to
      zero. If a value is not specified by the caller, a suitable value is
      computed based on experimental results.  For `num_buckets` less
      than 100, the value of 0.01 is chosen to handle a dataset of up to
      ~1 trillion input data values.  If `num_buckets` is larger,
      then epsilon is set to (1/`num_buckets`) to enforce a stricter
      error tolerance, because more buckets will result in smaller range for
      each bucket, and so we want the boundaries to be less fuzzy.
      See analyzers.quantiles() for details.
    weights: (Optional) Weights tensor for the quantiles. Tensor must have the
      same shape as x.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` of the same shape as `x`, with each element in the
    returned tensor representing the bucketized value. Bucketized value is
    in the range [0, actual_num_buckets). Sometimes the actual number of
    buckets can be different than num_buckets hint, for example in case the
    number of distinct values is smaller than num_buckets, or in cases where
    the input values are not uniformly distributed.

  Raises:
    ValueError: If value of num_buckets is not > 1.
  """
  with tf.name_scope(name, 'bucketize'):
    # Validate num_buckets before any graph construction happens.
    if not isinstance(num_buckets, int):
      raise TypeError('num_buckets must be an int, got %s' % type(num_buckets))
    if num_buckets < 1:
      raise ValueError('Invalid num_buckets %d' % num_buckets)

    if epsilon is None:
      # See explanation in args documentation for epsilon.
      epsilon = min(1.0 / num_buckets, 0.01)

    # Quantiles are computed over the non-missing values only; for a dense
    # tensor that is simply the tensor itself.
    if isinstance(x, tf.SparseTensor):
      value_tensor = x.values
    else:
      value_tensor = x

    boundaries = analyzers.quantiles(value_tensor, num_buckets, epsilon,
                                     weights)
    return apply_buckets(x, boundaries)
def bucketize(x, num_buckets, epsilon=None, name=None):
  """Returns a bucketized column, with a bucket index assigned to each input.

  Args:
    x: A numeric input `Tensor` whose values should be mapped to buckets.
    num_buckets: Values in the input `x` are divided into approximately
      equal-sized buckets, where the number of buckets is num_buckets.
    epsilon: (Optional) Error tolerance, typically a small fraction close to
      zero. If a value is not specified by the caller, a suitable value is
      computed based on experimental results.  For `num_buckets` less
      than 100, the value of 0.01 is chosen to handle a dataset of up to
      ~1 trillion input data values.  If `num_buckets` is larger,
      then epsilon is set to (1/`num_buckets`) to enforce a stricter
      error tolerance, because more buckets will result in smaller range for
      each bucket, and so we want the boundaries to be less fuzzy.
      See analyzers.quantiles() for details.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` of the same shape as `x`, with each element in the returned
    tensor representing the bucketized value. Bucketized value is in the range
    [0, num_buckets).

  Raises:
    TypeError: If num_buckets is not an int.
    ValueError: If value of num_buckets is not > 1.
  """
  with tf.name_scope(name, 'bucketize'):
    if not isinstance(num_buckets, int):
      # BUG FIX: the format argument was previously passed as a second
      # positional arg to the exception constructor (logging-style), so the
      # message was never interpolated.  Use %-formatting instead.
      raise TypeError('num_buckets must be an int, got %s' % type(num_buckets))

    if num_buckets < 1:
      # BUG FIX: same tuple-instead-of-%-formatting defect as above.
      raise ValueError('Invalid num_buckets %d' % num_buckets)

    if epsilon is None:
      # See explanation in args documentation for epsilon.
      epsilon = min(1.0 / num_buckets, 0.01)

    bucket_boundaries = analyzers.quantiles(x, num_buckets, epsilon)
    buckets = quantile_ops.bucketize_with_input_boundaries(
        x, boundaries=bucket_boundaries, name='assign_buckets')

    # Convert to int64 because int32 is not compatible with tf.Example parser.
    # See _TF_EXAMPLE_ALLOWED_TYPES in FixedColumnRepresentation()
    # in tf_metadata/dataset_schema.py
    return tf.to_int64(buckets)