Beispiel #1
0
def compute_and_apply_vocabulary(x,
                                 default_value=-1,
                                 top_k=None,
                                 frequency_threshold=None,
                                 num_oov_buckets=0,
                                 vocab_filename=None,
                                 name=None):
    r"""Computes a vocabulary over `x` and maps `x` to integer ids with it.

    Tokens that are empty or contain the '\n' or '\r' characters are
    discarded, because vocabularies are currently written out as text files.
    This limitation may be lifted in the future.

    Calling this function causes a vocabulary to be computed. On large
    datasets it is strongly advised to set `frequency_threshold` or `top_k`
    so that both the vocabulary size and the run time of the operation stay
    bounded.

    Args:
      x: A `Tensor` or `SparseTensor` of type tf.string.
      default_value: The id given to out-of-vocabulary values when
        `num_oov_buckets` is not greater than zero.
      top_k: Restricts the generated vocabulary to its first `top_k`
        elements. With None the full vocabulary is generated.
      frequency_threshold: Restricts the generated vocabulary to elements
        whose absolute frequency is >= this threshold. With None the full
        vocabulary is generated. Absolute frequency is the number of
        occurrences of the element in the dataset, not the proportion of
        instances that contain it.
      num_oov_buckets: When greater than zero, an out-of-vocabulary token is
        mapped to a bucket id derived from its hash; otherwise it receives
        `default_value`.
      vocab_filename: File name for the vocabulary file. With None, a name
        derived from the scope name in the context of this graph is used.
        When given, it should be unique within a preprocessing function.
        NOTE: set `vocab_filename` explicitly whenever a downstream
        component relies on the file name, so the pipeline does not depend
        on implementation details.
      name: (Optional) A name for this operation.

    Returns:
      A `Tensor` or `SparseTensor` in which every string value has been
      replaced by an integer. Each unique string in the vocabulary gets its
      own integer, and the integers are consecutive starting from zero. A
      string that is not in the vocabulary is assigned `default_value`.

    Raises:
      ValueError: If `top_k` or `frequency_threshold` is negative.
    """
    with tf.name_scope(name, 'compute_and_apply_vocabulary'):
        # Analysis step: request the vocabulary for `x`.
        vocab_result = analyzers.vocabulary(
            x=x,
            top_k=top_k,
            frequency_threshold=frequency_threshold,
            vocab_filename=vocab_filename)
        # Mapping step: translate `x` through that vocabulary.
        mapped = apply_vocabulary(
            x, vocab_result, default_value, num_oov_buckets)
        return mapped
def _one_phase_preprocessing_fn(inputs):
    """Subtracts the mean of `_plus_one(inputs['x'])` from `inputs['y']`.

    `inputs['y']` is cast to float32 and combined with the negated mean via
    `tf.sparse.add`. A vocabulary analyzer is also invoked on `inputs['s']`;
    its result is not used in the returned features.

    Args:
      inputs: Mapping that provides the 'x', 'y' and 's' features.
        NOTE(review): the use of `tf.sparse.add` suggests 'y' is a
        `SparseTensor` -- confirm against the caller.

    Returns:
      A dict with the single key 'subtracted'.
    """
    shifted_x = _plus_one(inputs['x'])
    y_as_float = tf.cast(inputs['y'], tf.float32)
    negated_mean = -analyzers.mean(shifted_x)
    result = tf.sparse.add(y_as_float, negated_mean)
    # Invoked only for the analyzer itself; the return value is discarded.
    analyzers.vocabulary(inputs['s'])
    return {'subtracted': result}
def _side_affect_preprocessing_fn(inputs):
    """Invokes a vocabulary analyzer on `inputs['s']` and emits no features.

    Args:
      inputs: Mapping that provides the 's' feature.

    Returns:
      An empty dict.
    """
    # The analyzer's return value is intentionally not used.
    analyzers.vocabulary(inputs['s'])
    return dict()
Beispiel #4
0
def compute_and_apply_vocabulary(x,
                                 default_value=-1,
                                 top_k=None,
                                 frequency_threshold=None,
                                 num_oov_buckets=0,
                                 vocab_filename=None,
                                 weights=None,
                                 labels=None,
                                 use_adjusted_mutual_info=False,
                                 min_diff_from_avg=0.0,
                                 coverage_top_k=None,
                                 coverage_frequency_threshold=None,
                                 key_fn=None,
                                 fingerprint_shuffle=False,
                                 name=None):
    r"""Computes a vocabulary over `x` and maps `x` to integer ids with it.

    Tokens that are empty or contain the '\n' or '\r' characters are
    discarded, because vocabularies are currently written out as text files.
    This limitation may be lifted in the future.

    Calling this function causes a vocabulary to be computed. On large
    datasets it is strongly advised to set `frequency_threshold` or `top_k`
    so that both the vocabulary size and the run time of the operation stay
    bounded.

    Args:
      x: A `Tensor` or `SparseTensor` of type tf.string.
      default_value: The id given to out-of-vocabulary values when
        `num_oov_buckets` is not greater than zero.
      top_k: Restricts the generated vocabulary to its first `top_k`
        elements. With None the full vocabulary is generated.
      frequency_threshold: Restricts the generated vocabulary to elements
        whose absolute frequency is >= this threshold. With None the full
        vocabulary is generated. Absolute frequency is the number of
        occurrences of the element in the dataset, not the proportion of
        instances that contain it.
      num_oov_buckets: When greater than zero, an out-of-vocabulary token is
        mapped to a bucket id derived from its hash; otherwise it receives
        `default_value`.
      vocab_filename: File name for the vocabulary file. With None, a name
        derived from the scope name in the context of this graph is used.
        When given, it should be unique within a preprocessing function.
        NOTE: set `vocab_filename` explicitly whenever a downstream
        component relies on the file name, so the pipeline does not depend
        on implementation details.
      weights: (Optional) Weights `Tensor` for the vocabulary. Must have the
        same shape as `x`.
      labels: (Optional) Labels `Tensor` for the vocabulary. Must have dtype
        int64, contain only the values 0 or 1, and have the same shape as
        `x`.
      use_adjusted_mutual_info: If true, use adjusted mutual information.
      min_diff_from_avg: The mutual information of a feature is adjusted to
        zero whenever the difference between the count of the feature with
        any label and its expected count is below this value.
      coverage_top_k: (Optional, experimental) The minimum number of
        elements per key to be included in the vocabulary.
      coverage_frequency_threshold: (Optional, experimental) Restricts the
        coverage arm of the vocabulary to elements whose absolute frequency
        is >= this threshold for a given key.
      key_fn: (Optional, experimental) A function that takes a single entry
        of `x` and returns the key used for the coverage calculation. With
        `None`, no coverage arm is added to the vocabulary.
      fingerprint_shuffle: (Optional, experimental) Whether to order the
        vocabularies by fingerprint rather than by counts, which is useful
        for load balancing on the training parameter servers. The shuffle
        only happens while the files are written, so all the filters above
        still apply.
      name: (Optional) A name for this operation.

    Returns:
      A `Tensor` or `SparseTensor` in which every string value has been
      replaced by an integer. Each unique string in the vocabulary gets its
      own integer, and the integers are consecutive starting from zero. A
      string that is not in the vocabulary is assigned `default_value`.

    Raises:
      ValueError: If `top_k` or `frequency_threshold` is negative.
        If `coverage_top_k` or `coverage_frequency_threshold` is negative.
    """
    with tf.name_scope(name, 'compute_and_apply_vocabulary'):
        # Everything except the lookup-only settings (`default_value`,
        # `num_oov_buckets`) is forwarded to the vocabulary analyzer.
        analyzer_kwargs = {
            'x': x,
            'top_k': top_k,
            'frequency_threshold': frequency_threshold,
            'vocab_filename': vocab_filename,
            'weights': weights,
            'labels': labels,
            'use_adjusted_mutual_info': use_adjusted_mutual_info,
            'min_diff_from_avg': min_diff_from_avg,
            'coverage_top_k': coverage_top_k,
            'coverage_frequency_threshold': coverage_frequency_threshold,
            'key_fn': key_fn,
            'fingerprint_shuffle': fingerprint_shuffle,
            'name': name,
        }
        vocab_result = analyzers.vocabulary(**analyzer_kwargs)
        # Translate `x` through the vocabulary that was just requested.
        mapped = apply_vocabulary(
            x, vocab_result, default_value, num_oov_buckets)
        return mapped
def _one_phase_preprocessing_fn(inputs):
    """Subtracts the mean of `inputs['y']` from `inputs['x']`.

    A vocabulary analyzer is also invoked on `inputs['s']`; its result is
    not used in the returned features.

    Args:
      inputs: Mapping that provides the 'x', 'y' and 's' features.

    Returns:
      A dict with the single key 'x_centered'.
    """
    x_values = inputs['x']
    y_mean = analyzers.mean(inputs['y'])
    centered = x_values - y_mean
    # Invoked only for the analyzer itself; the return value is discarded.
    analyzers.vocabulary(inputs['s'])
    return {'x_centered': centered}