Example #1
def string_to_int(x,
                  default_value=-1,
                  top_k=None,
                  frequency_threshold=None,
                  num_oov_buckets=0):
    """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string.
    default_value: The value to use for out-of-vocabulary values, unless
      `num_oov_buckets` is greater than zero.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      frequency is >= the supplied threshold. If set to None, the full
      vocabulary is generated.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an
    integer; each unique string value is mapped to a different integer, and the
    integers are consecutive and start from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    if top_k is not None:
        top_k = int(top_k)
        if top_k < 0:
            raise ValueError('top_k must be non-negative, but got: %r' % top_k)

    if frequency_threshold is not None:
        frequency_threshold = int(frequency_threshold)
        if frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold must be non-negative, but got: %r' %
                frequency_threshold)

    def _fix_vocab_if_needed(vocab):
        num_to_add = 1 - tf.minimum(tf.size(vocab), 1)
        return tf.concat([
            vocab,
            tf.fill(tf.reshape(num_to_add,
                               (1, )), '__dummy_value__index_zero__')
        ], 0)

    def _apply_vocab(x, vocab):
        table = lookup.string_to_index_table_from_tensor(
            vocab,
            num_oov_buckets=num_oov_buckets,
            default_value=default_value)
        return table.lookup(x)

    vocab = analyzers.uniques(x,
                              top_k=top_k,
                              frequency_threshold=frequency_threshold)
    vocab = _fix_vocab_if_needed(vocab)
    return api.apply_function(_apply_vocab, x, vocab)
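
A minimal usage sketch (not part of the original example), assuming `string_to_int` is exposed at the package level as `tft.string_to_int`, as in older tf.Transform releases; the feature name 'color' and the threshold value are made up for illustration:

import tensorflow_transform as tft

def preprocessing_fn(inputs):
    # Hypothetical preprocessing function: maps each string in the 'color'
    # feature to a dense integer id. Values seen fewer than 2 times during
    # analysis fall outside the vocabulary and receive default_value=-1.
    return {
        'color_id': tft.string_to_int(
            inputs['color'], default_value=-1, frequency_threshold=2),
    }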
Example #2
def string_to_int(x,
                  default_value=-1,
                  top_k=None,
                  frequency_threshold=None,
                  num_oov_buckets=0,
                  vocab_filename=None,
                  name=None):
    r"""Generates a vocabulary for `x` and maps it to an integer with this vocab.

  If a token contains the '\n' or '\r' characters or is empty, it will be
  discarded, since the vocabularies are currently written as text files. This
  behavior will likely be fixed/improved in the future.

  Note that this function will cause a vocabulary to be computed.  For large
  datasets it is highly recommended to set `frequency_threshold` or `top_k` to
  control both the size of the vocabulary and the run time of this operation.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string.
    default_value: The value to use for out-of-vocabulary values, unless
      `num_oov_buckets` is greater than zero.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      absolute frequency is >= the supplied threshold. If set to None, the
      full vocabulary is generated.  Absolute frequency means the number of
      occurrences of the element in the dataset, as opposed to the proportion of
      instances that contain that element.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.
    vocab_filename: The file name for the vocabulary file. If None, the
      "uniques" scope name in the context of this graph will be used as the file
      name. If not None, should be unique within a given preprocessing function.
      NOTE: To make your pipelines resilient to implementation details, please
      set `vocab_filename` explicitly whenever the vocabulary file is consumed
      by a downstream component.
    name: (Optional) A name for this operation.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an
    integer; each unique string value is mapped to a different integer and
    integers are consecutive and start from default_value.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    with tf.name_scope(name, 'string_to_int'):
        deferred_vocab_and_filename = analyzers.uniques(
            x,
            top_k=top_k,
            frequency_threshold=frequency_threshold,
            vocab_filename=vocab_filename)
        return apply_vocab(x, deferred_vocab_and_filename, default_value,
                           num_oov_buckets)
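
The `apply_vocab` helper invoked at the end of this example is not shown. A rough sketch of what such a mapper could look like, assuming TF 1.x and `tf.contrib.lookup.index_table_from_file`, and treating the deferred value as a plain vocabulary filename; this is an approximation for illustration, not the library's actual implementation:

import tensorflow as tf

def apply_vocab_sketch(x, vocabulary_file, default_value=-1, num_oov_buckets=0):
    # Build a string->index table from the vocabulary file written by the
    # `uniques` analyzer, then look up every value of `x` in it.
    # (In graph mode, tf.tables_initializer() must run before the lookup.)
    table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=vocabulary_file,
        num_oov_buckets=num_oov_buckets,
        default_value=default_value)
    return table.lookup(x)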
Example #3
def string_to_int(x, default_value=-1, top_k=None, frequency_threshold=None):
    """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Column` representing a string value or values.
    default_value: The value to use for out-of-vocabulary values.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      frequency is >= the supplied threshold. If set to None, the full
      vocabulary is generated.

  Returns:
    A `Column` where each string value is mapped to an integer; each unique
    string value is mapped to a different integer, and the integers are
    consecutive and start from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    if top_k is not None:
        top_k = int(top_k)
        if top_k < 0:
            raise ValueError('top_k must be non-negative, but got: %r' % top_k)

    if frequency_threshold is not None:
        frequency_threshold = int(frequency_threshold)
        if frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold must be non-negative, but got: %r' %
                frequency_threshold)

    def map_to_int(x, vocab):
        table = lookup.string_to_index_table_from_tensor(
            vocab, default_value=default_value)
        return table.lookup(x)

    return api.map(
        map_to_int, x,
        analyzers.uniques(x,
                          top_k=top_k,
                          frequency_threshold=frequency_threshold))
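
To make the `top_k` and `frequency_threshold` semantics concrete, here is a small pure-Python illustration of how the vocabulary would be pruned; it only models the frequency counting, and the ordering of ties inside the actual analyzer may differ:

from collections import Counter

tokens = ['a', 'b', 'a', 'c', 'a', 'b', 'd']
counts = Counter(tokens)  # Counter({'a': 3, 'b': 2, 'c': 1, 'd': 1})

# frequency_threshold=2 keeps only elements seen at least twice.
vocab_by_threshold = [t for t, c in counts.most_common() if c >= 2]  # ['a', 'b']

# top_k=3 keeps the three most frequent elements.
vocab_by_top_k = [t for t, _ in counts.most_common(3)]               # ['a', 'b', 'c']

# Anything outside the vocabulary is later mapped to default_value (-1 here).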
Example #4
def string_to_int(x,
                  default_value=-1,
                  top_k=None,
                  frequency_threshold=None,
                  num_oov_buckets=0,
                  vocab_filename=None):
    """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string.
    default_value: The value to use for out-of-vocabulary values, unless
      `num_oov_buckets` is greater than zero.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      frequency is >= the supplied threshold. If set to None, the full
      vocabulary is generated.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.
    vocab_filename: The file name for the vocabulary file. If None, the
      "uniques" scope name in the context of this graph will be used as the file
      name. If not None, should be unique within a given preprocessing function.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an
    integer; each unique string value is mapped to a different integer, and the
    integers are consecutive and start from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    if top_k is not None:
        top_k = int(top_k)
        if top_k < 0:
            raise ValueError('top_k must be non-negative, but got: %r' % top_k)

    if frequency_threshold is not None:
        frequency_threshold = int(frequency_threshold)
        if frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold must be non-negative, but got: %r' %
                frequency_threshold)

    def _apply_vocab(x, vocabulary_file):
        table = lookup.string_to_index_table_from_file(
            vocabulary_file,
            num_oov_buckets=num_oov_buckets,
            default_value=default_value)
        table_size = table.size()
        return table.lookup(x), table_size

    with tf.name_scope('string_to_int'):
        prefix = None
        if vocab_filename is None:
            prefix = analyzers.VOCAB_FILENAME_PREFIX
        vocab_filename = analyzers.sanitized_vocab_filename(
            vocab_filename, prefix)
        vocabulary_file = analyzers.uniques(
            x,
            top_k=top_k,
            frequency_threshold=frequency_threshold,
            vocab_filename=vocab_filename)
        result, table_size = api.apply_function(_apply_vocab, x,
                                                vocabulary_file)

    # Set the min and max values of the domain, where the max value is a `Future`
    # wrapping the max_value tensor.  Note that min_value is a regular Python
    # value while max_value is a tensor.  This tensor's value cannot be known
    # until the vocab has been computed.
    #
    # `table_size` includes the num oov buckets.  The default value is only used
    # if num_oov_buckets > 0.
    min_value = 0
    max_value = table_size - 1
    if num_oov_buckets <= 0:
        min_value = min(min_value, default_value)
        max_value = tf.maximum(max_value, default_value)
    column_schema = dataset_schema.infer_column_schema_from_tensor(result)
    column_schema.domain = dataset_schema.IntDomain(
        result.dtype,
        min_value=min_value,
        max_value=futures.Future(max_value.name),
        vocabulary_file=vocab_filename)
    api.set_column_schema(result, column_schema)

    return result
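
The domain bookkeeping at the end of this example boils down to simple arithmetic. A sketch of the intended bounds, where `vocab_size` stands in for the size of the computed vocabulary (the helper name and the numbers are illustrative only):

def int_domain_bounds(vocab_size, default_value=-1, num_oov_buckets=0):
    # The table size includes the OOV buckets; valid ids are 0 .. table_size - 1.
    table_size = vocab_size + num_oov_buckets
    min_value, max_value = 0, table_size - 1
    if num_oov_buckets <= 0:
        # Without OOV buckets, unknown strings receive default_value, which may
        # fall outside [0, table_size - 1], so the domain must cover it too.
        min_value = min(min_value, default_value)
        max_value = max(max_value, default_value)
    return min_value, max_value

print(int_domain_bounds(vocab_size=10))                     # (-1, 9)
print(int_domain_bounds(vocab_size=10, num_oov_buckets=3))  # (0, 12)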
Example #5
def string_to_int(x, default_value=-1, top_k=None, frequency_threshold=None):
    """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Column` representing a string value or values.
    default_value: The value to use for out-of-vocabulary values.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      frequency is >= the supplied threshold. If set to None, the full
      vocabulary is generated.

  Returns:
    A `Column` where each string value is mapped to an integer; each unique
    string value is mapped to a different integer, and the integers are
    consecutive and start from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    if top_k is not None:
        top_k = int(top_k)
        if top_k < 0:
            raise ValueError('top_k must be non-negative, but got: %r' % top_k)

    if frequency_threshold is not None:
        frequency_threshold = int(frequency_threshold)
        if frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold must be non-negative, but got: %r' %
                frequency_threshold)

    def map_to_int(x, vocab):
        """Maps string tensor into indexes using vocab.

    It uses a dummy vocab when the input vocab is empty.

    Args:
      x: A `Tensor`/`SparseTensor` of strings.
      vocab: A `Tensor`/`SparseTensor` containing unique string values within x.

    Returns:
      A `Tensor`/`SparseTensor` of indices (int) of the same shape as x.
    """
        def fix_vocab_if_needed(vocab):
            num_to_add = 1 - tf.minimum(tf.size(vocab), 1)
            return tf.concat([
                vocab,
                tf.fill(tf.reshape(num_to_add,
                                   (1, )), '__dummy_value__index_zero__')
            ], 0)

        table = lookup.string_to_index_table_from_tensor(
            fix_vocab_if_needed(vocab), default_value=default_value)
        return table.lookup(x)

    return api.map(
        map_to_int, x,
        analyzers.uniques(x,
                          top_k=top_k,
                          frequency_threshold=frequency_threshold))
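
The `fix_vocab_if_needed` helper pads an empty vocabulary with a single dummy entry so that building the lookup table stays well-defined even when no values were seen during analysis; a non-empty vocabulary passes through unchanged. A pure-Python analogue of the padding logic (a sketch only, with names chosen to mirror the TF code):

def fix_vocab_if_needed(vocab):
    # Add exactly one placeholder entry when the vocabulary is empty;
    # otherwise return the vocabulary unchanged.
    num_to_add = 1 - min(len(vocab), 1)
    return list(vocab) + ['__dummy_value__index_zero__'] * num_to_add

print(fix_vocab_if_needed([]))          # ['__dummy_value__index_zero__']
print(fix_vocab_if_needed(['a', 'b']))  # ['a', 'b']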