These examples assume tf.Transform-era imports along the lines of the following
(the exact module paths varied across versions):

  import tensorflow as tf
  import six
  from tensorflow.contrib import lookup
  from tensorflow.contrib.boosted_trees.python.ops import quantile_ops
  from tensorflow_transform import analyzers
  from tensorflow_transform import api
  from tensorflow_transform import futures
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.tf_metadata import dataset_schema as sch

Example #1
def test_infer_column_schema_from_tensor(self):
    dense = tf.constant([[1., 2.], [3., 4.]], dtype=tf.float32, shape=[2, 2])
    column_schema = sch.infer_column_schema_from_tensor(dense)
    expected_column_schema = sch.ColumnSchema(
        tf.float32, [2], sch.FixedColumnRepresentation())
    self.assertEqual(expected_column_schema, column_schema)

    varlen = tf.sparse_placeholder(tf.string)
    column_schema = sch.infer_column_schema_from_tensor(varlen)
    expected_column_schema = sch.ColumnSchema(
        tf.string, [None], sch.ListColumnRepresentation())
    self.assertEqual(expected_column_schema, column_schema)
Example #2
def apply_buckets(x, bucket_boundaries, name=None):
    """Returns a bucketized column, with a bucket index assigned to each input.

  Args:
    x: A numeric input `Tensor` whose values should be mapped to buckets.
    bucket_boundaries: The bucket boundaries represented as a list.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` of the same shape as `x`, with each element in the
    returned tensor representing the bucketized value. Bucketized value is
    in the range [0, len(bucket_boundaries)].
  """
    with tf.name_scope(name, 'apply_buckets'):
        buckets = quantile_ops.bucketize_with_input_boundaries(
            x, boundaries=bucket_boundaries, name='assign_buckets')
        # Convert to int64 because int32 is not compatible with tf.Example parser.
        # See _TF_EXAMPLE_ALLOWED_TYPES in FixedColumnRepresentation()
        # in tf_metadata/dataset_schema.py
        result = tf.to_int64(buckets)

        # Attach the relevant metadata to result, so that the corresponding
        # output feature will have this metadata set.
        max_value = tf.shape(bucket_boundaries)[1]
        column_schema = dataset_schema.infer_column_schema_from_tensor(result)
        column_schema.domain = dataset_schema.IntDomain(
            result.dtype,
            min_value=0,
            max_value=futures.Future(max_value.name),
            is_categorical=True)
        api.set_column_schema(result, column_schema)
        return result
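
A minimal usage sketch for apply_buckets, assuming the tf.Transform analyzers
API of the same era; the preprocessing_fn wrapper and the column name 'price'
are illustrative, not part of the source above:

import tensorflow_transform as tft

def preprocessing_fn(inputs):
    # tft.quantiles computes deferred bucket boundaries over the whole
    # dataset; apply_buckets then assigns each value its bucket index.
    boundaries = tft.quantiles(inputs['price'], num_buckets=10, epsilon=0.01)
    return {'price_bucketized': tft.apply_buckets(inputs['price'], boundaries)}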
Example #3
def test_infer_column_schema_from_tensor(self):
    dense = tf.constant([[1., 2.], [3., 4.]], dtype=tf.float32, shape=[2, 2])
    column_schema = sch.infer_column_schema_from_tensor(dense)
    expected_column_schema = sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.float32),
                                sch.LogicalShape([sch.Axis(2)])),
        sch.FixedColumnRepresentation())
    self.assertEqual(expected_column_schema, column_schema)

    varlen = tf.sparse_placeholder(tf.string)
    column_schema = sch.infer_column_schema_from_tensor(varlen)
    expected_column_schema = sch.ColumnSchema(
        sch.LogicalColumnSchema(sch.dtype_to_domain(tf.string),
                                sch.LogicalShape([sch.Axis(None)])),
        sch.ListColumnRepresentation())
    self.assertEqual(expected_column_schema, column_schema)
Example #4
def infer_feature_schema(graph, tensors):
  """Given a dict of tensors, creates a `Schema`.

  Infers a schema, in the format of a tf.Transform `Schema`, for the given
  dictionary of tensors.  If a tensor has a ColumnSchema set using
  api.set_column_schema then this schema will be used instead of inferring a
  schema.

  Args:
    graph: The graph that tensors belong to.
    tensors: A dict mapping column names to tensors. The tensors should have a
      0'th dimension interpreted as the batch dimension.

  Returns:
    A `Schema` object.
  """
  schema_overrides = api.get_column_schemas(graph)

  # If the tensor already has a schema attached, use that. Otherwise infer the
  # schema from the underlying tensor.
  return dataset_schema.Schema({
      name: schema_overrides.get(
          tensor, dataset_schema.infer_column_schema_from_tensor(tensor))
      for name, tensor in six.iteritems(tensors)
  })
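
A small sketch of calling this version of infer_feature_schema; the graph
setup, placeholder dtypes, and feature names are illustrative:

graph = tf.Graph()
with graph.as_default():
    tensors = {
        'age': tf.placeholder(tf.int64, shape=[None]),  # batched scalar column
        'tokens': tf.sparse_placeholder(tf.string),     # variable-length column
    }
    # Produces a Schema with one inferred ColumnSchema per dict entry, unless
    # an override was registered on the graph via api.set_column_schema.
    schema = infer_feature_schema(graph, tensors)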
Example #5
def infer_feature_schema(features, graph, session=None):
    """Given a dict of tensors, creates a `Schema`.

  Infers a schema, in the format of a tf.Transform `Schema`, for the given
  dictionary of tensors.

  If there is an override specified, we override the inferred schema for the
  given feature's tensor.  An override has the meaning that we should set
  is_categorical=True.  If session is not provided then we just set
  is_categorical=True, and if the session is provided then was also compute
  values of the tensors representing the min and max values and set them in the
  schema.

  Args:
    features: A dict mapping column names to `Tensor` or `SparseTensor`s. The
        `Tensor` or `SparseTensor`s should have a 0'th dimension which is
        interpreted as the batch dimension.
    graph: A tf.Graph, used to look up schema overrides even they are not
        computed.
    session: (optional) A `tf.Session` used to compute schema overrides.  If
        None, schema overrides will not be computed.

  Returns:
    A `Schema` object.
  """
    tensor_overrides = _get_tensor_schema_overrides(graph)

    column_schemas = {}
    for name, tensor in six.iteritems(features):
        column_schema = dataset_schema.infer_column_schema_from_tensor(tensor)
        override_min_and_max = tensor_overrides.get(
            tensor.values if isinstance(tensor, tf.SparseTensor) else tensor)
        if override_min_and_max is not None:
            assert column_schema.domain.dtype == tf.int64
            assert isinstance(column_schema.domain, dataset_schema.IntDomain)
            if session is not None:
                min_value, max_value = session.run(override_min_and_max)
            else:
                min_value, max_value = None, None
            column_schemas[name] = dataset_schema.ColumnSchema(
                dataset_schema.IntDomain(tf.int64,
                                         min_value,
                                         max_value,
                                         is_categorical=True),
                column_schema.axes, column_schema.representation)
        else:
            column_schemas[name] = column_schema

    return dataset_schema.Schema(column_schemas)
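
A sketch of the session-backed path, assuming min/max overrides were
registered on the graph by a mapper such as string_to_int; the feature name
is illustrative:

with tf.Graph().as_default() as graph:
    features = {'label': tf.placeholder(tf.int64, shape=[None])}
    with tf.Session(graph=graph) as session:
        # With a session, any registered min/max override tensors are
        # evaluated and written into the column's IntDomain.
        schema = infer_feature_schema(features, graph, session=session)
    # Without a session, overridden columns still get is_categorical=True,
    # but min_value and max_value are left as None.
    schema_no_session = infer_feature_schema(features, graph)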
Example #6
def infer_feature_schema(tensors):
    """Given a dict of tensors, creates a `Schema`.

  Infers a schema, in the format of a tf.Transform `Schema`, for the given
  dictionary of tensors.

  Args:
    tensors: A dict mapping column names to tensors. The tensors should have a
      0'th dimension interpreted as the batch dimension.

  Returns:
    A `Schema` object.
  """
    return dataset_schema.Schema({
        name: dataset_schema.infer_column_schema_from_tensor(tensor)
        for name, tensor in six.iteritems(tensors)
    })
Example #7
def infer_feature_schema(columns):
    """Given a dict of columns, creates a `Schema`.

  Infers a schema, in the format of a tf.Transform `Schema`, for the given
  dictionary of columns.

  Args:
    columns: A dict mapping column names to `Column`s. The tensors represented
      by these columns should have a 0'th dimension interpreted as the batch
      dimension. In order to pass a tensor representing a single instance, it
      must be wrapped in a batch of size 1.

  Returns:
    A `Schema` object.
  """
    # If the column already has a schema attached, use that. Otherwise infer the
    # schema from the underlying tensor.
    return dataset_schema.Schema({
        name: (column.schema if column.schema else
               dataset_schema.infer_column_schema_from_tensor(column.tensor))
        for name, column in six.iteritems(columns)
    })
Example #8
def string_to_int(x,
                  default_value=-1,
                  top_k=None,
                  frequency_threshold=None,
                  num_oov_buckets=0,
                  vocab_filename=None):
    """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string.
    default_value: The value to use for out-of-vocabulary values, unless
      'num_oov_buckets' is greater than zero.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      frequency is >= to the supplied threshold. If set to None, the full
      vocabulary is generated.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.
    vocab_filename: The file name for the vocabulary file. If none, the
      "uniques" scope name in the context of this graph will be used as the file
      name. If not None, should be unique within a given preprocessing function.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an integer
    where each unique string value is mapped to a different integer and integers
    are consecutive and starting from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    if top_k is not None:
        top_k = int(top_k)
        if top_k < 0:
            raise ValueError('top_k must be non-negative, but got: %r' % top_k)

    if frequency_threshold is not None:
        frequency_threshold = int(frequency_threshold)
        if frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold must be non-negative, but got: %r' %
                frequency_threshold)

    def _apply_vocab(x, vocabulary_file):
        table = lookup.string_to_index_table_from_file(
            vocabulary_file,
            num_oov_buckets=num_oov_buckets,
            default_value=default_value)
        table_size = table.size()
        return table.lookup(x), table_size

    with tf.name_scope('string_to_int'):
        prefix = None
        if vocab_filename is None:
            prefix = analyzers.VOCAB_FILENAME_PREFIX
        vocab_filename = analyzers.sanitized_vocab_filename(
            vocab_filename, prefix)
        vocabulary_file = analyzers.uniques(
            x,
            top_k=top_k,
            frequency_threshold=frequency_threshold,
            vocab_filename=vocab_filename)
        result, table_size = api.apply_function(_apply_vocab, x,
                                                vocabulary_file)

    # Set the min and max values of the domain, where the max value is a `Future`
    # wrapping the max_value tensor.  Note that min_value is a regular Python
    # value while max_value is a tensor.  This tensor's value cannot be known
    # until the vocab has been computed.
    #
    # `table_size` includes the num oov buckets.  The default value is only used
    # if num_oov_buckets > 0.
    min_value = 0
    max_value = table_size - 1
    if num_oov_buckets <= 0:
        min_value = min(min_value, default_value)
        max_value = tf.maximum(max_value, default_value)
    column_schema = dataset_schema.infer_column_schema_from_tensor(result)
    column_schema.domain = dataset_schema.IntDomain(
        result.dtype,
        min_value=min_value,
        max_value=futures.Future(max_value.name),
        vocabulary_file=vocab_filename)
    api.set_column_schema(result, column_schema)

    return result
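
A usage sketch for string_to_int inside a preprocessing_fn; the column name
'color' and the vocab filename are illustrative:

def preprocessing_fn(inputs):
    # Builds a vocabulary over all values of 'color' during analysis, then
    # maps each string to its index; OOV strings hash into 10 extra buckets.
    return {
        'color_id': string_to_int(
            inputs['color'], num_oov_buckets=10, vocab_filename='color_vocab'),
    }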
Example #9
def apply_vocab(x,
                deferred_vocab_filename_tensor,
                default_value=-1,
                num_oov_buckets=0,
                lookup_fn=None,
                name=None):
    r"""Maps `x` to a vocabulary specified by the deferred tensor.

  This function also writes domain statistics about the vocabulary min and max
  values. Note that the min and max are inclusive, and depend on the vocab size,
  num_oov_buckets and default_value.

  In case one of the tokens contains the '\n' or '\r' characters or is empty it
  will be discarded since we are currently writing the vocabularies as text
  files. This behavior will likely be fixed/improved in the future.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string to which the vocabulary
      transformation should be applied.
      The colum names are those intended for the transformed tensors.
    deferred_vocab_filename_tensor: The deferred vocab filename tensor as
      returned by `tft.uniques`.
    default_value: The value to use for out-of-vocabulary values, unless
      'num_oov_buckets' is greater than zero.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.
    lookup_fn: Optional lookup function, if specified it should take a
      tensor and a deferred vocab filename as an input and return a lookup `op`
      along with the table size, by default `apply_vocab` performs a
      lookup.string_to_index_table_from_file for the table lookup.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an
    integer; each unique string value is mapped to a different integer and
    integers are consecutive and start from default_value.
  """
    def _apply_vocab(y, deferred_vocab_filename_tensor):
        table = lookup.index_table_from_file(deferred_vocab_filename_tensor,
                                             num_oov_buckets=num_oov_buckets,
                                             default_value=default_value)
        table_size = table.size()
        return table.lookup(y), table_size

    with tf.name_scope(name, 'apply_vocab'):
        lookup_fn = lookup_fn or _apply_vocab

        result, table_size = api.apply_function(
            lookup_fn, x, deferred_vocab_filename_tensor)

        # Set the min and max values of the domain, where the max value is a
        # `Future` wrapping the max_value tensor.  Note that min_value is a regular
        # Python value while max_value is a tensor.  This tensor's value cannot be
        # known until the vocab has been computed.
        #
        # `table_size` includes the num oov buckets.  The default value is only used
        # if num_oov_buckets > 0.
        min_value = 0
        max_value = table_size - 1
        if num_oov_buckets <= 0:
            min_value = min(min_value, default_value)
            max_value = tf.maximum(max_value, default_value)
        column_schema = dataset_schema.infer_column_schema_from_tensor(result)
        # Extract the relative vocab filename from the absolute pathname.
        file_name_tensor = tf.string_split([deferred_vocab_filename_tensor],
                                           '/').values[-1]
        column_schema.domain = dataset_schema.IntDomain(
            result.dtype,
            min_value=min_value,
            max_value=futures.Future(max_value.name),
            is_categorical=True,
            vocabulary_file=futures.Future(file_name_tensor.name))
        api.set_column_schema(result, column_schema)

        return result
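
A sketch pairing apply_vocab with the uniques analyzer, which produces the
deferred vocabulary filename it expects; the column names and vocab filename
are illustrative:

def preprocessing_fn(inputs):
    # uniques writes the vocabulary file during the analysis phase and
    # returns its path as a deferred tensor, which apply_vocab consumes.
    vocab_path = analyzers.uniques(inputs['terms'], vocab_filename='terms_vocab')
    return {'term_id': apply_vocab(inputs['terms'], vocab_path,
                                   num_oov_buckets=1)}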