Example #1
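All of the snippets below are excerpted from tensorflow_transform (TF 1.x era) and omit their surrounding imports. A plausible shared preamble, shown here as an assumption (module paths are inferred from the identifiers used, not verified against a specific release), would be:

import tensorflow as tf
from tensorflow.contrib import lookup
from tensorflow.contrib.session_bundle import bundle_shim  # Example #9 only
from tensorflow_transform import analyzers
from tensorflow_transform import api
from tensorflow_transform import futures
from tensorflow_transform import impl_helper
from tensorflow_transform import mappers
from tensorflow_transform.tf_metadata import dataset_schema
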
  def testCreatePhasesWithApplyFunctionWithOverlappingInputsAndOutputs(self):
    string_placeholder = tf.placeholder(tf.string, shape=(None,))
    def degenerate_function(x):
      """A function whose input tensors and output tensors overlap."""
      return x
    api.apply_function(degenerate_function, string_placeholder)

    phases = impl_helper.create_phases()
    self.assertEqual(len(phases), 0)
Example #2
def string_to_int(x,
                  default_value=-1,
                  top_k=None,
                  frequency_threshold=None,
                  num_oov_buckets=0):
    """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string.
    default_value: The value to use for out-of-vocabulary values, unless
      'num_oov_buckets' is greater than zero.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      frequency is >= to the supplied threshold. If set to None, the full
      vocabulary is generated.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an
    integer; each unique string value is mapped to a different integer, and
    the integers are consecutive and start from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    if top_k is not None:
        top_k = int(top_k)
        if top_k < 0:
            raise ValueError('top_k must be non-negative, but got: %r' % top_k)

    if frequency_threshold is not None:
        frequency_threshold = int(frequency_threshold)
        if frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold must be non-negative, but got: %r' %
                frequency_threshold)

    def _fix_vocab_if_needed(vocab):
        num_to_add = 1 - tf.minimum(tf.size(vocab), 1)
        return tf.concat([
            vocab,
            tf.fill(tf.reshape(num_to_add, (1,)), '__dummy_value__index_zero__')
        ], 0)

    def _apply_vocab(x, vocab):
        table = lookup.string_to_index_table_from_tensor(
            vocab,
            num_oov_buckets=num_oov_buckets,
            default_value=default_value)
        return table.lookup(x)

    vocab = analyzers.uniques(x,
                              top_k=top_k,
                              frequency_threshold=frequency_threshold)
    vocab = _fix_vocab_if_needed(vocab)
    return api.apply_function(_apply_vocab, x, vocab)
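
A minimal usage sketch for the function above (hedged: `preprocessing_fn` follows the convention from the other examples here, and 's' is a hypothetical string feature name):

def preprocessing_fn(inputs):
    # Map each string in the hypothetical 's' column to a dense integer id;
    # out-of-vocabulary strings hash into one of two extra buckets instead of
    # being assigned default_value.
    return {'s_integerized': string_to_int(inputs['s'], num_oov_buckets=2)}
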
Example #3
 def preprocessing_fn(inputs):
   def _subtract_ten(x):
     i = tf.constant(0)
     c = lambda i, x: tf.less(i, 10)
     b = lambda i, x: (tf.add(i, 1), tf.add(x, -1))
     return tf.while_loop(c, b, [i, x])[1]
   scaled_to_0_1 = mappers.scale_to_0_1(
       api.apply_function(_subtract_ten, inputs['x']))
   return {'x_scaled': scaled_to_0_1}
Example #4
    def testCreatePhasesWithControlFlowOpsWrappedInApplyFunction(self):
        int_placeholder = tf.placeholder(tf.int64, shape=(None, ))
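        # NOTE: `_subtract_ten` is assumed to be the tf.while_loop helper
        # defined in Example #3.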
        int_placeholder_minus_10 = api.apply_function(_subtract_ten,
                                                      int_placeholder)
        # We need to call an analyzer after the loop because only the
        # transitive parents of analyzers are inspected by create_phases.
        mappers.scale_to_0_1(int_placeholder_minus_10)

        phases = impl_helper.create_phases()
        self.assertEqual(len(phases), 1)
        # tft.scale_to_0_1 uses a single analyzer: analyzers._min_and_max.
        self.assertEqual(len(phases[0].analyzer_infos), 1)
Example #5
def string_to_int(x,
                  default_value=-1,
                  top_k=None,
                  frequency_threshold=None,
                  num_oov_buckets=0,
                  vocab_filename=None):
    """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string.
    default_value: The value to use for out-of-vocabulary values, unless
      'num_oov_buckets' is greater than zero.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      frequency is >= to the supplied threshold. If set to None, the full
      vocabulary is generated.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.
    vocab_filename: The file name for the vocabulary file. If None, the
      "uniques" scope name in the context of this graph will be used as the
      file name. If not None, it should be unique within a given preprocessing
      function.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an
    integer; each unique string value is mapped to a different integer, and
    the integers are consecutive and start from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    if top_k is not None:
        top_k = int(top_k)
        if top_k < 0:
            raise ValueError('top_k must be non-negative, but got: %r' % top_k)

    if frequency_threshold is not None:
        frequency_threshold = int(frequency_threshold)
        if frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold must be non-negative, but got: %r' %
                frequency_threshold)

    def _apply_vocab(x, vocabulary_file):
        table = lookup.string_to_index_table_from_file(
            vocabulary_file,
            num_oov_buckets=num_oov_buckets,
            default_value=default_value)
        table_size = table.size()
        return table.lookup(x), table_size

    with tf.name_scope('string_to_int'):
        prefix = None
        if vocab_filename is None:
            prefix = analyzers.VOCAB_FILENAME_PREFIX
        vocab_filename = analyzers.sanitized_vocab_filename(
            vocab_filename, prefix)
        vocabulary_file = analyzers.uniques(
            x,
            top_k=top_k,
            frequency_threshold=frequency_threshold,
            vocab_filename=vocab_filename)
        result, table_size = api.apply_function(_apply_vocab, x,
                                                vocabulary_file)

    # Set the min and max values of the domain, where the max value is a `Future`
    # wrapping the max_value tensor.  Note that min_value is a regular Python
    # value while max_value is a tensor.  This tensor's value cannot be known
    # until the vocab has been computed.
    #
    # `table_size` includes the num oov buckets.  The default value is only
    # used if num_oov_buckets <= 0.
    min_value = 0
    max_value = table_size - 1
    if num_oov_buckets <= 0:
        min_value = min(min_value, default_value)
        max_value = tf.maximum(max_value, default_value)
    column_schema = dataset_schema.infer_column_schema_from_tensor(result)
    column_schema.domain = dataset_schema.IntDomain(
        result.dtype,
        min_value=min_value,
        max_value=futures.Future(max_value.name),
        vocabulary_file=vocab_filename)
    api.set_column_schema(result, column_schema)

    return result
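
A usage sketch for this variant (again hedged; 's' and 'vocab_s' are hypothetical names):

def preprocessing_fn(inputs):
    # Pinning vocab_filename makes the emitted vocabulary file discoverable by
    # name; per the docstring it must be unique within the preprocessing_fn.
    return {'s_integerized': string_to_int(inputs['s'],
                                           vocab_filename='vocab_s')}
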
Example #6
def apply_vocab(x, deferred_vocab_filename_tensor, default_value=-1,
                num_oov_buckets=0, lookup_fn=None, name=None):
  r"""Maps `x` to a vocabulary specified by the deferred tensor.

  This function also writes domain statistics about the vocabulary min and max
  values. Note that the min and max are inclusive, and depend on the vocab size,
  num_oov_buckets and default_value.

  In case one of the tokens contains the '\n' or '\r' characters or is empty,
  it will be discarded, since we are currently writing the vocabularies as
  text files. This behavior will likely be fixed/improved in the future.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string to which the vocabulary
      transformation should be applied.
      The column names are those intended for the transformed tensors.
    deferred_vocab_filename_tensor: The deferred vocab filename tensor as
      returned by `tft.uniques`.
    default_value: The value to use for out-of-vocabulary values, unless
      'num_oov_buckets' is greater than zero.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.
    lookup_fn: Optional lookup function; if specified it should take a
      tensor and a deferred vocab filename as input and return a lookup `op`
      along with the table size. By default `apply_vocab` uses
      `lookup.index_table_from_file` for the table lookup.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an
    integer; each unique string value is mapped to a different integer and
    integers are consecutive and start from default_value.
  """

  def _apply_vocab(y, deferred_vocab_filename_tensor):
    table = lookup.index_table_from_file(
        deferred_vocab_filename_tensor,
        num_oov_buckets=num_oov_buckets,
        default_value=default_value)
    table_size = table.size()
    return table.lookup(y), table_size

  with tf.name_scope(name, 'apply_vocab'):
    lookup_fn = lookup_fn or _apply_vocab

    result, table_size = api.apply_function(
        lookup_fn, x, deferred_vocab_filename_tensor)

    # Specify schema overrides which will override the values in the schema
    # with the min and max values, which are deferred as they are only known
    # once the analyzer has run.
    #
    # `table_size` includes the num oov buckets.  The default value is only
    # used if num_oov_buckets <= 0.
    min_value = tf.constant(0, tf.int64)
    max_value = table_size - 1
    if num_oov_buckets <= 0:
      min_value = tf.minimum(min_value, default_value)
      max_value = tf.maximum(max_value, default_value)
    api.set_tensor_schema_overrides(result, min_value, max_value)

    return result
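
A sketch of how `apply_vocab` pairs with the `uniques` analyzer, modeled on Example #5 where `string_to_int` composes the same two calls ('s' and 'vocab_s' are hypothetical names):

def preprocessing_fn(inputs):
    x = inputs['s']
    # `uniques` defers the vocabulary computation to the analysis phase;
    # `apply_vocab` then wires the deferred filename into a table lookup.
    deferred_vocab = analyzers.uniques(x, vocab_filename='vocab_s')
    return {'s_integerized': apply_vocab(x, deferred_vocab)}
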
Example #7
 def preprocessing_fn(inputs):
   return {
       'index': api.apply_function(lambda x: x, inputs['a'])
   }
Example #8
def apply_vocab(x,
                deferred_vocab_filename_tensor,
                default_value=-1,
                num_oov_buckets=0,
                lookup_fn=None,
                name=None):
    r"""Maps `x` to a vocabulary specified by the deferred tensor.

  This function also writes domain statistics about the vocabulary min and max
  values. Note that the min and max are inclusive, and depend on the vocab size,
  num_oov_buckets and default_value.

  In case one of the tokens contains the '\n' or '\r' characters or is empty,
  it will be discarded, since we are currently writing the vocabularies as
  text files. This behavior will likely be fixed/improved in the future.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string to which the vocabulary
      transformation should be applied.
      The column names are those intended for the transformed tensors.
    deferred_vocab_filename_tensor: The deferred vocab filename tensor as
      returned by `tft.uniques`.
    default_value: The value to use for out-of-vocabulary values, unless
      'num_oov_buckets' is greater than zero.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.
    lookup_fn: Optional lookup function; if specified it should take a
      tensor and a deferred vocab filename as input and return a lookup `op`
      along with the table size. By default `apply_vocab` uses
      `lookup.index_table_from_file` for the table lookup.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an
    integer; each unique string value is mapped to a different integer and
    integers are consecutive and start from default_value.
  """
    def _apply_vocab(y, deferred_vocab_filename_tensor):
        table = lookup.index_table_from_file(deferred_vocab_filename_tensor,
                                             num_oov_buckets=num_oov_buckets,
                                             default_value=default_value)
        table_size = table.size()
        return table.lookup(y), table_size

    with tf.name_scope(name, 'apply_vocab'):
        lookup_fn = lookup_fn or _apply_vocab

        result, table_size = api.apply_function(
            lookup_fn, x, deferred_vocab_filename_tensor)

        # Set the min and max values of the domain, where the max value is a
        # `Future` wrapping the max_value tensor.  Note that min_value is a regular
        # Python value while max_value is a tensor.  This tensor's value cannot be
        # known until the vocab has been computed.
        #
        # `table_size` includes the num oov buckets.  The default value is
        # only used if num_oov_buckets <= 0.
        min_value = 0
        max_value = table_size - 1
        if num_oov_buckets <= 0:
            min_value = min(min_value, default_value)
            max_value = tf.maximum(max_value, default_value)
        column_schema = dataset_schema.infer_column_schema_from_tensor(result)
        # Extract the relative vocab filename from the absolute pathname.
        file_name_tensor = tf.string_split([deferred_vocab_filename_tensor],
                                           '/').values[-1]
        column_schema.domain = dataset_schema.IntDomain(
            result.dtype,
            min_value=min_value,
            max_value=futures.Future(max_value.name),
            is_categorical=True,
            vocabulary_file=futures.Future(file_name_tensor.name))
        api.set_column_schema(result, column_schema)

        return result
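
The `lookup_fn` hook in this variant lets callers substitute their own table construction. A hedged sketch of a custom lookup_fn that honors the documented contract of returning the lookup op together with the table size (`_single_oov_bucket_lookup` is a hypothetical helper; `x` and `deferred_vocab` are as in the sketch after Example #6):

def _single_oov_bucket_lookup(y, deferred_vocab_filename_tensor):
    # Identical to the default except that it forces exactly one OOV bucket,
    # regardless of the num_oov_buckets argument passed to apply_vocab.
    table = lookup.index_table_from_file(
        deferred_vocab_filename_tensor, num_oov_buckets=1)
    return table.lookup(y), table.size()

result = apply_vocab(x, deferred_vocab, lookup_fn=_single_oov_bucket_lookup)
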
Example #9
def apply_saved_model(model_dir,
                      inputs,
                      tags,
                      signature_name=None,
                      output_keys_in_signature=None):
    """Applies a SavedModel to some `Tensor`s.

  Applies a SavedModel to `inputs`. The SavedModel is specified with
  `model_dir`, `tags` and `signature_name`. Note that the SavedModel will be
  converted to an all-constants graph.

  Args:
    model_dir: A path containing a SavedModel.
    inputs: A dict whose keys are the names from the input signature and whose
        values are `Tensor`s. If there is only one input in the model's input
        signature then `inputs` can be a single `Tensor`.
    tags: The tags specifying which metagraph to load from the SavedModel.
    signature_name: The name of the signature to use from the loaded model.
        The default value None can be used if there is only one signature in
        the MetaGraphDef.
    output_keys_in_signature: A list of strings which should be a subset of
        the outputs in the signature of the SavedModel. The returned `Tensor`s
        will correspond to specified output `Tensor`s, in the same order. The
        default value None can be used if there is only one output in the
        signature.

  Returns:
    A `Tensor` or list of `Tensor`s representing the application of the
        SavedModel.

  Raises:
    ValueError: if
    `inputs` is of an invalid type, or
    `signature_name` is None but the SavedModel contains multiple signatures, or
    `inputs` does not match the signature inputs, or
    `output_keys_in_signature` is not a subset of the signature outputs.
  """
    # Load model, get graph, inputs and outputs.
    loaded_graph = tf.Graph()
    loaded_initializer_op_names = []

    with loaded_graph.as_default():
        session, meta_graph = (
            bundle_shim.load_session_bundle_or_saved_model_bundle_from_path(
                model_dir, tags=tags))
        loaded_initializer_op_names = [
            op.name
            for op in tf.get_collection(tf.GraphKeys.TABLE_INITIALIZERS)
        ]

        if signature_name:
            signature = meta_graph.signature_def[signature_name]
        elif len(meta_graph.signature_def) > 1:
            raise ValueError(
                'The SavedModel contains multiple signatures (%r) but signature_name '
                'was not specified.' % (meta_graph.signature_def.keys(), ))
        else:
            # list() keeps this working on Python 3, where .values() is a view.
            signature = list(meta_graph.signature_def.values())[0]

    # Generate mapping from tensors in the graph to the input tensors.
    if isinstance(inputs, dict):
        if set(signature.inputs.keys()) != set(inputs.keys()):
            raise ValueError(
                'The keys in `inputs` (%r) do not match inputs of the SavedModel '
                '(%r).' % (inputs.keys(), signature.inputs.keys()))
        input_name_to_tensor_map = {
            signature.inputs[key].name: inputs[key]
            for key in inputs.keys()
        }
    elif len(signature.inputs) != 1:
        raise ValueError(
            'The SavedModel does not have exactly one input (had inputs %r) but '
            '`inputs` was not a dict.' % (signature.inputs.keys(), ))
    else:
        input_name_to_tensor_map = {
            list(signature.inputs.values())[0].name: inputs
        }

    # Get output tensor names.
    if output_keys_in_signature:
        if not set(output_keys_in_signature) <= set(signature.outputs.keys()):
            raise ValueError(
                'output_keys_in_signature (%r) is not a subset of outputs of the '
                'SavedModel (%r).' %
                (output_keys_in_signature, signature.outputs.keys()))

        output_tensor_names = [
            signature.outputs[key].name for key in output_keys_in_signature
        ]
        output_single_tensor = False
    elif len(signature.outputs) != 1:
        raise ValueError(
            'The SavedModel does not have exactly one output (had outputs %r) but '
            'output_keys_in_signature was not specified.' %
            (signature.outputs.keys(), ))
    else:
        output_tensor_names = [list(signature.outputs.values())[0].name]
        output_single_tensor = True

    # convert_variables_to_constants() requires op names, not tensor names.
    output_op_names = [
        loaded_graph.get_tensor_by_name(tensor_name).op.name
        for tensor_name in output_tensor_names
    ]
    constant_graph_def = tf.graph_util.convert_variables_to_constants(
        session, loaded_graph.as_graph_def(),
        output_op_names + loaded_initializer_op_names)

    def import_graph_and_return_output_tensors():
        """Imports the model's constant-converted GraphDef into the default graph.

    We must also copy the table initializers from the model's graph into the
    composed graph. As a result, this function must be wrapped in
    api.apply_function().

    Returns:
      The model's output tensor(s).
    """
        returned_elements = tf.import_graph_def(
            constant_graph_def,
            input_map=input_name_to_tensor_map,
            return_elements=output_tensor_names + loaded_initializer_op_names)
        returned_output_tensors = returned_elements[:len(output_tensor_names)]
        returned_initializer_ops = returned_elements[len(output_tensor_names):]

        for initializer_op in returned_initializer_ops:
            tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS,
                                 initializer_op)

        if output_single_tensor:
            assert len(output_tensor_names) == 1
            return returned_output_tensors[0]
        else:
            return returned_output_tensors

    return api.apply_function(import_graph_and_return_output_tensors)
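
A usage sketch (hedged: the model path, tag, and feature names are hypothetical):

def preprocessing_fn(inputs):
    # Embeds the exported model as an all-constants subgraph of the
    # preprocessing graph; the dict keys must match the input signature.
    embedding = apply_saved_model(
        '/tmp/my_saved_model',
        {'text': inputs['text']},
        tags=[tf.saved_model.tag_constants.SERVING])
    return {'embedding': embedding}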