Example #1
    def vocabulary_size_by_name(self, vocab_filename: str) -> int:
        """Like vocabulary_file_by_name, but returns the size of vocabulary."""
        actual_vocab_filename = sanitized_vocab_filename(vocab_filename)
        vocab_path = self.vocabulary_file_by_name(actual_vocab_filename)
        if not vocab_path:
            raise ValueError(
                'Could not compute vocabulary size for {}, does not exist'.
                format(vocab_filename))
        elif vocab_path.endswith(actual_vocab_filename):
            with tf.io.gfile.GFile(vocab_path, 'rb') as f:
                return sum(1 for _ in f)
        elif vocab_path.endswith('tfrecord.gz'):
            dataset = tf.data.TFRecordDataset(vocab_path,
                                              compression_type='GZIP')

            def reduce_fn(accum, elem):
                return tf.size(elem, out_type=tf.int64,
                               name='vocabulary_size') + accum

            return _get_tensor_value(
                dataset.batch(tf.int32.max).reduce(tf.constant(0, tf.int64),
                                                   reduce_fn))
        else:
            raise ValueError('Could not find vocabulary: {} ({})'.format(
                vocab_filename, vocab_path))
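A minimal usage sketch, assuming this method lives on a tft.TFTransformOutput instance and that a transform output directory already exists at the (illustrative) path below:

import tensorflow_transform as tft

tft_output = tft.TFTransformOutput('/tmp/transform_output')  # placeholder path
# The vocabulary name is illustrative; use whatever vocab_filename was passed
# to tft.vocabulary or tft.compute_and_apply_vocabulary.
size = tft_output.vocabulary_size_by_name('color_vocab')
print('vocabulary size:', size)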
Example #2
  def vocabulary_file_by_name(self, vocab_filename):
    """Returns the vocabulary file path created in the preprocessing function.

    `vocab_filename` must be the name used as the vocab_filename argument to
    tft.compute_and_apply_vocabulary or tft.vocabulary. By convention, this
    should be the name of the feature that the vocab was computed for, where
    possible.

    Args:
      vocab_filename: The relative filename to lookup.
    """
    return os.path.join(self.transform_savedmodel_dir,
                        tf.saved_model.ASSETS_DIRECTORY,
                        sanitized_vocab_filename(filename=vocab_filename))
Example #3
    def vocabulary_file_by_name(self, vocab_filename: str) -> Optional[str]:
        """Returns the vocabulary file path created in the preprocessing function.

        `vocab_filename` must either be (i) the name used as the vocab_filename
        argument to tft.compute_and_apply_vocabulary / tft.vocabulary or (ii)
        the key used in tft.annotate_asset.

        When a mapping has been specified by calls to tft.annotate_asset, it is
        checked first for the provided filename. If present, that filename is
        used directly to construct a path.

        If the mapping does not exist or `vocab_filename` is not present within
        it, we fall back to sanitizing `vocab_filename` and searching for
        matching files within the assets directory.

        In either case, if the constructed path does not point to an existing
        file within the assets subdirectory, None is returned.

        Args:
          vocab_filename: The vocabulary name to lookup.
        """
        mapping_path = os.path.join(self._transform_output_dir,
                                    self.TRANSFORMED_METADATA_DIR,
                                    self.ASSET_MAP)

        mapping = {}
        if tf.io.gfile.exists(mapping_path):
            with tf.io.gfile.GFile(mapping_path) as f:
                mapping = json.loads(f.read())
                if vocab_filename in mapping:
                    vocab_path = os.path.join(self.transform_savedmodel_dir,
                                              tf.saved_model.ASSETS_DIRECTORY,
                                              mapping[vocab_filename])
                    if tf.io.gfile.exists(vocab_path):
                        return vocab_path

        prefix = os.path.join(
            self.transform_savedmodel_dir, tf.saved_model.ASSETS_DIRECTORY,
            sanitized_vocab_filename(filename=vocab_filename))
        files = tf.io.gfile.glob(prefix) + tf.io.gfile.glob(
            '{}.tfrecord.gz'.format(prefix))
        if not files:
            return None
        if len(files) != 1:
            raise ValueError(
                'Found too many vocabulary files: {}'.format(files))
        return files[0]
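The asset map consulted above is written when a preprocessing function calls tft.annotate_asset. A hedged sketch of how such a mapping might be registered (feature and key names are made up for illustration):

import tensorflow_transform as tft

def preprocessing_fn(inputs):
    # Compute a vocabulary for a string feature and map it to integer ids.
    ids = tft.compute_and_apply_vocabulary(
        inputs['color'], vocab_filename='color_vocab')
    # Register an extra lookup key for the vocabulary asset, so that
    # vocabulary_file_by_name('color_key') resolves it via the asset map.
    tft.annotate_asset('color_key', 'color_vocab')
    return {'color_ids': ids}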
Example #4
  def vocabulary_by_name(self, vocab_filename):
    """Like vocabulary_file_by_name but returns a list."""
    actual_vocab_filename = sanitized_vocab_filename(vocab_filename)
    vocab_path = self.vocabulary_file_by_name(actual_vocab_filename)
    if not vocab_path:
      raise ValueError('Could not read vocabulary: {}, does not exist'.format(
          vocab_filename))
    elif vocab_path.endswith(actual_vocab_filename):
      with tf.io.gfile.GFile(vocab_path, 'rb') as f:
        return [l.rstrip() for l in f]
    elif vocab_path.endswith('tfrecord.gz'):
      dataset = tf_utils.read_tfrecord_vocabulary_dataset(vocab_path)
      vocab_tensor = dataset.batch(tf.int32.max).reduce(
          tf.constant([], dtype=tf.string),
          lambda state, elem: tf.concat([state, elem], axis=-1))
      # Using as_numpy_iterator only works when executing eagerly.
      return _get_tensor_value(vocab_tensor).tolist()
    else:
      raise ValueError('Could not find vocabulary: {} ({})'.format(
          vocab_filename, vocab_path))
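Since the plain-text branch opens the file in 'rb' mode, the returned list holds bytes entries. A hedged sketch of decoding them (directory and vocabulary name are placeholders):

import tensorflow_transform as tft

tft_output = tft.TFTransformOutput('/tmp/transform_output')  # placeholder path
vocab = tft_output.vocabulary_by_name('color_vocab')
terms = [entry.decode('utf-8') for entry in vocab]
print(terms[:10])  # first ten vocabulary terms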
Example #5
  def vocabulary_file_by_name(self, vocab_filename):
    """Returns the vocabulary file path created in the preprocessing function.

    `vocab_filename` must be the name used as the vocab_filename argument to
    tft.compute_and_apply_vocabulary or tft.vocabulary. By convention, this
    should be the name of the feature that the vocab was computed for, where
    possible.

    Args:
      vocab_filename: The relative filename to lookup.
    """
    prefix = os.path.join(self.transform_savedmodel_dir,
                          tf.saved_model.ASSETS_DIRECTORY,
                          sanitized_vocab_filename(filename=vocab_filename))
    files = tf.io.gfile.glob(prefix) + tf.io.gfile.glob(
        '{}.tfrecord.gz'.format(prefix))
    if not files:
      return None
    if len(files) != 1:
      raise ValueError('Found too many vocabulary files: {}'.format(files))
    return files[0]
Example #6
def _get_approx_vocab_filename(vocab_filename: Optional[str],
                               store_frequency: bool) -> str:
    """Returns a sanitized vocabulary filename with the appropriate prefix applied.

    Args:
      vocab_filename: The file name for the approximate vocabulary file. If None,
        the "approximate_vocabulary" scope name in the context of this graph will
        be used as the file name.
      store_frequency: A bool that is True when the vocabulary for which this
        generates a filename stores term frequency. False otherwise.

    Returns:
      A valid filename.
    """
    if vocab_filename is not None:
        prefix = None
    elif store_frequency:
        prefix = _APPROXIMATE_VOCAB_FREQUENCY_FILENAME_PREFIX
    else:
        prefix = _APPROXIMATE_VOCAB_FILENAME_PREFIX

    # Make the file name path safe.
    return analyzers.sanitized_vocab_filename(vocab_filename, prefix=prefix)
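A hedged illustration of the three branches; the two prefix values below are stand-ins, since the real module-level constants are not shown in this snippet:

# Hypothetical stand-ins for the module-level constants:
_APPROXIMATE_VOCAB_FILENAME_PREFIX = 'approx_vocab_'
_APPROXIMATE_VOCAB_FREQUENCY_FILENAME_PREFIX = 'approx_vocab_frequency_'

# An explicit filename is sanitized as-is, with no prefix applied:
_get_approx_vocab_filename('my vocab', store_frequency=False)  # -> 'my_vocab'
# Without a filename, the chosen prefix is combined with the current scope
# name and records whether term frequencies are stored:
_get_approx_vocab_filename(None, store_frequency=True)   # frequency prefix
_get_approx_vocab_filename(None, store_frequency=False)  # plain prefix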
Example #7
def string_to_int(x,
                  default_value=-1,
                  top_k=None,
                  frequency_threshold=None,
                  num_oov_buckets=0,
                  vocab_filename=None):
    """Generates a vocabulary for `x` and maps it to an integer with this vocab.

  Args:
    x: A `Tensor` or `SparseTensor` of type tf.string.
    default_value: The value to use for out-of-vocabulary values, unless
      'num_oov_buckets' is greater than zero.
    top_k: Limit the generated vocabulary to the first `top_k` elements. If set
      to None, the full vocabulary is generated.
    frequency_threshold: Limit the generated vocabulary only to elements whose
      frequency is >= to the supplied threshold. If set to None, the full
      vocabulary is generated.
    num_oov_buckets:  Any lookup of an out-of-vocabulary token will return a
      bucket ID based on its hash if `num_oov_buckets` is greater than zero.
      Otherwise it is assigned the `default_value`.
    vocab_filename: The file name for the vocabulary file. If none, the
      "uniques" scope name in the context of this graph will be used as the file
      name. If not None, should be unique within a given preprocessing function.

  Returns:
    A `Tensor` or `SparseTensor` where each string value is mapped to an integer
    where each unique string value is mapped to a different integer and integers
    are consecutive and starting from 0.

  Raises:
    ValueError: If `top_k` or `frequency_threshold` is negative.
  """
    if top_k is not None:
        top_k = int(top_k)
        if top_k < 0:
            raise ValueError('top_k must be non-negative, but got: %r' % top_k)

    if frequency_threshold is not None:
        frequency_threshold = int(frequency_threshold)
        if frequency_threshold < 0:
            raise ValueError(
                'frequency_threshold must be non-negative, but got: %r' %
                frequency_threshold)

    def _apply_vocab(x, vocabulary_file):
        table = lookup.string_to_index_table_from_file(
            vocabulary_file,
            num_oov_buckets=num_oov_buckets,
            default_value=default_value)
        table_size = table.size()
        return table.lookup(x), table_size

    with tf.name_scope('string_to_int'):
        prefix = None
        if vocab_filename is None:
            prefix = analyzers.VOCAB_FILENAME_PREFIX
        vocab_filename = analyzers.sanitized_vocab_filename(
            vocab_filename, prefix)
        vocabulary_file = analyzers.uniques(
            x,
            top_k=top_k,
            frequency_threshold=frequency_threshold,
            vocab_filename=vocab_filename)
        result, table_size = api.apply_function(_apply_vocab, x,
                                                vocabulary_file)

    # Set the min and max values of the domain, where the max value is a `Future`
    # wrapping the max_value tensor.  Note that min_value is a regular Python
    # value while max_value is a tensor.  This tensor's value cannot be known
    # until the vocab has been computed.
    #
    # `table_size` includes the num oov buckets.  The default value is only used
    # if num_oov_buckets <= 0.
    min_value = 0
    max_value = table_size - 1
    if num_oov_buckets <= 0:
        min_value = min(min_value, default_value)
        max_value = tf.maximum(max_value, default_value)
    column_schema = dataset_schema.infer_column_schema_from_tensor(result)
    column_schema.domain = dataset_schema.IntDomain(
        result.dtype,
        min_value=min_value,
        max_value=futures.Future(max_value.name),
        vocabulary_file=vocab_filename)
    api.set_column_schema(result, column_schema)

    return result
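A hedged sketch of calling this mapper from a preprocessing function (feature names are illustrative; in later tf.transform releases this mapper was renamed to tft.compute_and_apply_vocabulary):

import tensorflow_transform as tft

def preprocessing_fn(inputs):
    # Map the raw string feature to consecutive integer ids; out-of-vocabulary
    # tokens are hashed into a single extra bucket.
    return {
        'color_id': tft.string_to_int(
            inputs['color'], num_oov_buckets=1, vocab_filename='color_vocab'),
    }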