Code example #1
    # Excerpt from a tf.test.TestCase-style test.  Assumed imports:
    #   import tensorflow as tf
    #   from tensorflow_transform import schema_inference
    #   from tensorflow_transform.tf_metadata import dataset_schema
    def testInferFeatureSchemaWithSession(self):
        with tf.Graph().as_default() as graph:
            tensors = {
                'a': tf.placeholder(tf.float32, (None, )),
                'b': tf.placeholder(tf.string, (1, 2, 3)),
                'c': tf.placeholder(tf.int64, (None, ))
            }
            schema_inference.set_tensor_schema_override(
                tensors['c'], tf.constant(5), tf.constant(6))
            with tf.Session(graph=graph) as session:
                schema = schema_inference.infer_feature_schema(
                    tensors, graph, session)

        expected_schema = dataset_schema.Schema(
            column_schemas={
                'a':
                dataset_schema.ColumnSchema(
                    tf.float32, [],
                    dataset_schema.FixedColumnRepresentation()),
                'b':
                dataset_schema.ColumnSchema(
                    tf.string, [2, 3],
                    dataset_schema.FixedColumnRepresentation()),
                'c':
                dataset_schema.ColumnSchema(
                    dataset_schema.IntDomain(
                        tf.int64, 5, 6, is_categorical=True), [],
                    dataset_schema.FixedColumnRepresentation())
            })
        self.assertEqual(schema, expected_schema)
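
The `session` argument exists so that `infer_feature_schema` can evaluate the deferred override tensors registered via `set_tensor_schema_override`. For comparison, a minimal sketch of inference with no overrides, under the assumption (suggested by the test name) that the `session` argument is optional:

import tensorflow as tf
from tensorflow_transform import schema_inference

with tf.Graph().as_default() as graph:
    # No overrides registered, so there are no deferred min/max values to
    # evaluate and no session should be required (assumption, see above).
    tensors = {'a': tf.placeholder(tf.float32, (None,))}
    schema = schema_inference.infer_feature_schema(tensors, graph)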
Code example #2
# Assumed imports (this function appears to come from tensorflow_transform's
# mappers module; `_lookup_key` and `_combine_bucket_boundaries` are private
# helpers from the same module, not shown here):
#   import tensorflow as tf
#   from tensorflow.contrib.boosted_trees.python.ops import quantile_ops
#   from tensorflow_transform import schema_inference
def _apply_buckets_with_keys(x, key, key_vocab, bucket_boundaries, name=None):
    """Bucketize a Tensor or SparseTensor where boundaries depend on the key.

    Args:
      x: A 1-d Tensor or SparseTensor.
      key: A 1-d Tensor or SparseTensor with the same size as x.
      key_vocab: A vocab containing all keys.  Must be exhaustive; an
        out-of-vocab entry in `key` will cause a crash.
      bucket_boundaries: A rank-2 Tensor of shape (key_size, num_buckets).
      name: (Optional) A name for this operation.

    Returns:
      A tensor with the same shape as `x` and dtype tf.int64.
    """
    with tf.name_scope(name, 'apply_buckets_with_keys'):
        x_values = x.values if isinstance(x, tf.SparseTensor) else x
        key_values = key.values if isinstance(key, tf.SparseTensor) else key

        x_values = tf.to_float(x_values)
        # Convert `key_values` to indices in key_vocab; `_lookup_key` performs
        # the lookup via a Table.
        key_indices = _lookup_key(key_values, key_vocab)

        combined_boundaries, offsets = _combine_bucket_boundaries(
            bucket_boundaries)

        # Apply the per-key offsets to x, which produces offset buckets (where the
        # bucket offset is an integer offset).  Then remove this offset to get the
        # actual per-key buckets for x.
        offset_x = x_values + tf.gather(offsets, key_indices)
        offset_buckets = tf.to_int64(
            quantile_ops.bucketize_with_input_boundaries(
                offset_x, combined_boundaries))
        num_buckets = tf.to_int64(tf.shape(bucket_boundaries)[1])
        bucketized_values = tf.clip_by_value(
            offset_buckets - key_indices * num_buckets, 0, num_buckets)

        # Attach the relevant metadata to result, so that the corresponding
        # output feature will have this metadata set.
        min_value = tf.constant(0, tf.int64)
        max_value = num_buckets
        schema_inference.set_tensor_schema_override(bucketized_values,
                                                    min_value, max_value)

        if isinstance(x, tf.SparseTensor):
            result = tf.SparseTensor(x.indices, bucketized_values,
                                     x.dense_shape)
        else:
            result = bucketized_values

        return result
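
Since `_combine_bucket_boundaries` is not shown, here is a hedged NumPy illustration of the offset arithmetic the comments describe; the concrete offsets and the helper's exact behavior are assumptions made for this example:

import numpy as np

# Hypothetical setup: 2 keys, 3 boundaries per key.
bucket_boundaries = np.array([[0., 1., 2.],      # key 0
                              [10., 20., 30.]])  # key 1
num_buckets = bucket_boundaries.shape[1]

# Assumed behavior of _combine_bucket_boundaries: shift each key's
# boundaries into disjoint ranges and report the per-key shift.
offsets = np.array([0., 100.])  # hypothetical shifts
combined_boundaries = np.concatenate(
    [row + off for row, off in zip(bucket_boundaries, offsets)])

x_values = np.array([1.5, 25.0])
key_indices = np.array([0, 1])

# Mirror of the graph ops above: shift x, bucketize against the combined
# boundaries (searchsorted plays the role of bucketize), then subtract the
# per-key block offset and clip.
offset_x = x_values + offsets[key_indices]
offset_buckets = np.searchsorted(combined_boundaries, offset_x, side='right')
buckets = np.clip(offset_buckets - key_indices * num_buckets, 0, num_buckets)
print(buckets)  # -> [2 2]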
Code example #3
# Assumed imports (as in example #2):
#   import tensorflow as tf
#   from tensorflow.contrib.boosted_trees.python.ops import quantile_ops
#   from tensorflow_transform import schema_inference
def apply_buckets(x, bucket_boundaries, name=None):
    """Returns a bucketized column, with a bucket index assigned to each input.

    Args:
      x: A numeric input `Tensor` or `SparseTensor` whose values should be
        mapped to buckets.  For `SparseTensor`s, the non-missing values will
        be mapped to buckets and missing values left missing.
      bucket_boundaries: The bucket boundaries, represented as a rank-2
        `Tensor` of shape (1, num_boundaries) (the body reads
        `tf.shape(bucket_boundaries)[1]`, so a rank-1 tensor would not work).
      name: (Optional) A name for this operation.

    Returns:
      A `Tensor` of the same shape as `x`, with each element in the returned
      tensor representing the bucketized value.  The bucketized value is in
      the range [0, num_boundaries].
    """
    with tf.name_scope(name, 'apply_buckets'):
        x_values = x.values if isinstance(x, tf.SparseTensor) else x
        buckets = quantile_ops.bucketize_with_input_boundaries(
            x_values, boundaries=bucket_boundaries, name='assign_buckets')
        # Convert to int64 because int32 is not compatible with tf.Example parser.
        # See _TF_EXAMPLE_ALLOWED_TYPES in FixedColumnRepresentation()
        # in tf_metadata/dataset_schema.py
        bucketized_values = tf.to_int64(buckets)

        # Attach the relevant metadata to result, so that the corresponding
        # output feature will have this metadata set.
        min_value = tf.constant(0, tf.int64)
        # Cast to int64 to match `min_value` (tf.shape returns int32).
        max_value = tf.to_int64(tf.shape(bucket_boundaries)[1])
        schema_inference.set_tensor_schema_override(bucketized_values,
                                                    min_value, max_value)

        if isinstance(x, tf.SparseTensor):
            result = tf.SparseTensor(x.indices, bucketized_values,
                                     x.dense_shape)
        else:
            result = bucketized_values

        return result
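
For context, a minimal usage sketch of `apply_buckets` inside a Transform preprocessing function. The feature name 'price' and the quantiles parameters are invented for this example; `tft` is assumed to be `tensorflow_transform`:

import tensorflow_transform as tft

def preprocessing_fn(inputs):
    # `tft.quantiles` computes deferred bucket boundaries from the data;
    # `apply_buckets` then maps each value to its bucket index.
    boundaries = tft.quantiles(inputs['price'], num_buckets=4, epsilon=0.01)
    return {'price_bucket': tft.apply_buckets(inputs['price'], boundaries)}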
Code example #4
# Assumed imports:
#   import tensorflow as tf
#   from tensorflow.python.ops import lookup_ops
#   from tensorflow_transform import schema_inference
def apply_vocabulary(x,
                     deferred_vocab_filename_tensor,
                     default_value=-1,
                     num_oov_buckets=0,
                     lookup_fn=None,
                     name=None):
    r"""Maps `x` to a vocabulary specified by the deferred tensor.

  This function also writes domain statistics about the vocabulary min and max
  values. Note that the min and max are inclusive, and depend on the vocab size,
  num_oov_buckets and default_value.

  In case one of the tokens contains the '\n' or '\r' characters or is empty it
  will be discarded since we are currently writing the vocabularies as text
  files. This behavior will likely be fixed/improved in the future.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string to which the vocabulary
      transformation should be applied.
      The column names are those intended for the transformed tensors.
    deferred_vocab_filename_tensor: The deferred vocab filename tensor as
      returned by `tft.vocabulary`.
    default_value: The value to use for out-of-vocabulary values, unless
      'num_oov_buckets' is greater than zero.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.
    lookup_fn: Optional lookup function, if specified it should take a tensor
      and a deferred vocab filename as an input and return a lookup `op` along
      with the table size, by default `apply_vocab` performs a
      lookup_ops.index_table_from_file for the table lookup.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an
    integer. Each unique string value that appears in the vocabulary
    is mapped to a different integer and integers are consecutive
    starting from zero, and string value not in the vocabulary is
    assigned default_value.
  """
    with tf.name_scope(name, 'apply_vocab'):
        if lookup_fn:
            result, table_size = lookup_fn(x, deferred_vocab_filename_tensor)
        else:
            table = lookup_ops.index_table_from_file(
                deferred_vocab_filename_tensor,
                num_oov_buckets=num_oov_buckets,
                default_value=default_value)
            table_size = table.size()
            result = table.lookup(x)

        # Specify schema overrides which will override the values in the schema
        # with the min and max values, which are deferred as they are only known
        # once the analyzer has run.
        #
        # `table_size` includes the num oov buckets.  The default value is only used
        # if num_oov_buckets <= 0.
        min_value = tf.constant(0, tf.int64)
        max_value = table_size - 1
        if num_oov_buckets <= 0:
            min_value = tf.minimum(min_value, default_value)
            max_value = tf.maximum(max_value, default_value)
        schema_inference.set_tensor_schema_override(
            result.values if isinstance(result, tf.SparseTensor) else result,
            min_value, max_value)

        return result
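
A similar hedged usage sketch for `apply_vocabulary`; the feature name 'token' and the vocab filename are invented for this example:

import tensorflow_transform as tft

def preprocessing_fn(inputs):
    # `tft.vocabulary` returns the deferred vocab filename tensor that
    # `apply_vocabulary` consumes.
    vocab_file = tft.vocabulary(inputs['token'], vocab_filename='token_vocab')
    return {'token_id': tft.apply_vocabulary(inputs['token'], vocab_file)}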
Code example #5
# Assumed imports (also for example #6):
#   import tensorflow as tf
#   from tensorflow_transform import schema_inference
def _make_tensors_with_override():
    x = tf.placeholder(tf.int64, (None, ))
    schema_inference.set_tensor_schema_override(x, tf.constant(5),
                                                tf.constant(6))
    return {'x': x}
Code example #6
def _make_tensors_with_override(inputs):
    x = tf.identity(inputs['x'])
    schema_inference.set_tensor_schema_override(x, tf.constant(5),
                                                tf.constant(6))
    return {'x': x}
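
Examples #5 and #6 differ only in whether the tensor is a fresh placeholder or derived from given inputs. A hedged sketch of how the second variant might be exercised, mirroring the session pattern from example #1:

import tensorflow as tf
from tensorflow_transform import schema_inference

with tf.Graph().as_default() as graph:
    inputs = {'x': tf.placeholder(tf.int64, (None,))}
    outputs = _make_tensors_with_override(inputs)
    with tf.Session(graph=graph) as session:
        # The session lets the constant override tensors (min=5, max=6) be
        # evaluated into the inferred schema, as in example #1.
        schema = schema_inference.infer_feature_schema(outputs, graph, session)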