Example 1
0
    def test_windows_file(self):
        """Vocab files with Windows CRLF line endings are parsed into clean tokens."""
        crlf_content = b"line1\r\nline2\r\nline3"
        with tf.io.gfile.GFile(self._vocab_path, "wb") as f:
            f.write(crlf_content)

        vocab = table_utils.get_vocabulary_from_file(self._vocab_path)
        self.assertAllEqual(["line1", "line2", "line3"], vocab)
Example 2
0
  def test_only_line_separator_is_stripped(self):
    """Only the trailing line separator is stripped; leading/trailing spaces survive."""
    expected = ["foo", " foo", "foo ", " foo "]
    with tf.io.gfile.GFile(self._vocab_path, "w") as writer:
      for word in expected:
        writer.write(word)
        writer.write(os.linesep)

    # Fixed: the original line contained a duplicated `actual = actual =`
    # assignment (harmless at runtime, but clearly a typo).
    actual = table_utils.get_vocabulary_from_file(self._vocab_path)
    self.assertAllEqual(expected, actual)
Example 3
0
 def set_vocabulary(self, vocab, idf_weights=None):
   """Set the layer's vocabulary; a string `vocab` is read as a file path.

   TF-IDF mode cannot load a vocabulary from a file because no weight-file
   format is defined, so that combination raises RuntimeError.
   """
   vocab_is_path = isinstance(vocab, str)
   if vocab_is_path and self.output_mode == index_lookup.TFIDF:
     raise RuntimeError("Setting vocabulary directly from a file is not "
                        "supported in TF-IDF mode, since this layer cannot "
                        "read files containing TF-IDF weight data. Please "
                        "read the file using Python and set the vocab "
                        "and weights by passing lists or arrays to the "
                        "set_vocabulary function's `vocab` and "
                        "`idf_weights` args.")
   if vocab_is_path:
     vocab = table_utils.get_vocabulary_from_file(vocab, self.encoding)
   super().set_vocabulary(vocab, idf_weights=idf_weights)
Example 4
0
    def __init__(self,
                 max_values=None,
                 num_oov_indices=1,
                 mask_value=0,
                 oov_value=-1,
                 vocabulary=None,
                 invert=False,
                 output_mode=index_lookup.INT,
                 sparse=False,
                 pad_to_max_values=False,
                 **kwargs):
        """Validate arguments and construct the underlying integer lookup layer.

        Only tf.int64 dtypes are accepted; dtype defaults to tf.int64 when not
        supplied. A string `vocabulary` is interpreted as a path to a file of
        integer tokens, one per line.
        """
        allowed_dtypes = [tf.int64]

        # Default the dtype to int64, then reject any explicitly passed dtype
        # that is not in the allowed set.
        if kwargs.setdefault("dtype", tf.int64) not in allowed_dtypes:
            raise ValueError(
                "The value of the dtype argument for IntegerLookup may "
                "only be one of %s." % (allowed_dtypes, ))

        # A max_values of 0 or 1 would leave no room for a real vocabulary.
        if max_values is not None and max_values <= 1:
            raise ValueError("If set, max_values must be greater than 1. "
                             "You passed %s" % (max_values, ))

        if num_oov_indices < 0:
            raise ValueError(
                "num_oov_indices must be greater than or equal to 0. You passed %s"
                % (num_oov_indices, ))

        # A string vocabulary is a file path; load it and parse integer tokens.
        if isinstance(vocabulary, str):
            file_tokens = table_utils.get_vocabulary_from_file(vocabulary)
            vocabulary = [int(token) for token in file_tokens]

        super(IntegerLookup, self).__init__(
            max_tokens=max_values,
            num_oov_indices=num_oov_indices,
            mask_token=mask_value,
            oov_token=oov_value,
            vocabulary=vocabulary,
            invert=invert,
            output_mode=output_mode,
            sparse=sparse,
            pad_to_max_tokens=pad_to_max_values,
            **kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell("IntegerLookup").set(
            True)
Example 5
0
    def __init__(self,
                 max_tokens=None,
                 num_oov_indices=1,
                 mask_token="",
                 oov_token="[UNK]",
                 vocabulary=None,
                 encoding=None,
                 invert=False,
                 output_mode=index_lookup.INT,
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):
        """Validate arguments and construct the underlying string lookup layer.

        Only tf.string dtypes are accepted; dtype defaults to tf.string when
        not supplied. A string `vocabulary` is interpreted as a path to a
        vocabulary file decoded with `encoding` (default "utf-8").
        """
        allowed_dtypes = [tf.string]

        # Default the dtype to string, then reject any explicitly passed dtype
        # that is not in the allowed set.
        if kwargs.setdefault("dtype", tf.string) not in allowed_dtypes:
            raise ValueError(
                "The value of the dtype argument for StringLookup may "
                "only be one of %s." % (allowed_dtypes, ))

        encoding = "utf-8" if encoding is None else encoding

        # A string vocabulary is a file path; load it using `encoding`.
        if isinstance(vocabulary, str):
            vocabulary = table_utils.get_vocabulary_from_file(
                vocabulary, encoding)

        self.encoding = encoding

        super(StringLookup, self).__init__(
            max_tokens=max_tokens,
            num_oov_indices=num_oov_indices,
            mask_token=mask_token,
            oov_token=oov_token,
            vocabulary=vocabulary,
            invert=invert,
            output_mode=output_mode,
            sparse=sparse,
            pad_to_max_tokens=pad_to_max_tokens,
            **kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell("StringLookup").set(
            True)
Example 6
0
 def set_vocabulary(self, vocab):
     """Set the vocabulary; a string is read as a file of integer tokens."""
     if isinstance(vocab, str):
         tokens = table_utils.get_vocabulary_from_file(vocab)
         vocab = list(map(int, tokens))
     super().set_vocabulary(vocab)
Example 7
0
 def set_vocabulary(self, vocab):
     """Set the vocabulary; string input is treated as a vocab file path."""
     if not isinstance(vocab, str):
         super().set_vocabulary(vocab)
         return
     loaded = table_utils.get_vocabulary_from_file(vocab, self.encoding)
     super().set_vocabulary(loaded)
Example 8
0
    def set_vocabulary(self, vocab, df_data=None, oov_df_value=None):
        """Sets vocabulary (and optionally document frequency) data for this layer.

        This method sets the vocabulary and DF data for this layer directly,
        instead of analyzing a dataset through 'adapt'. It should be used
        whenever the vocab (and optionally document frequency) information is
        already known. If vocabulary data is already present in the layer,
        this method will replace it.

        Arguments:
          vocab: An array of string tokens, or a path to a file containing one
            token per line.
          df_data: An array of document frequency data. Only necessary if the
            layer output_mode is TFIDF.
          oov_df_value: The document frequency of the OOV token. Only necessary
            if output_mode is TFIDF.

        Raises:
          ValueError: If there are too many inputs, the inputs do not match, or
            input data is missing.
          RuntimeError: If the vocabulary cannot be set when this function is
            called. This happens when "binary", "count", and "tfidf" modes,
            if "pad_to_max_tokens" is False and the layer itself has already
            been called.
        """
        # df_data is only meaningful when producing TF-IDF output.
        if self._output_mode != TFIDF and df_data is not None:
            raise ValueError(
                "df_data should only be set if output_mode is TFIDF. "
                "output_mode is %s." % self._output_mode)

        # In the multi-hot style modes the output width is tied to the vocab
        # size; once the layer has been called without pad_to_max, that width
        # is fixed and the vocabulary can no longer change.
        if (self._output_mode in [BINARY, COUNT, TFIDF] and self._called
                and not self._pad_to_max):
            raise RuntimeError(
                ("When using TextVectorization in {mode} mode and "
                 "pad_to_max_tokens is False, the vocabulary cannot "
                 "be changed after the layer is "
                 "called.").format(mode=self._output_mode))

        # Handle reading from a file. We can't do this via TF-IDF, as we don't have
        # a standard format - we error out and ask our users to parse the file
        # themselves.
        if isinstance(vocab, str):
            if self._output_mode == TFIDF:
                raise RuntimeError(
                    "Setting vocabulary directly from a file is not "
                    "supported in TF-IDF mode, since this layer cannot "
                    "read files containing TF-IDF weight data. Please "
                    "read the file using Python and set the vocab "
                    "and weights by passing lists or arrays to the "
                    "set_vocabulary function's `vocab` and `df_data` "
                    "args.")
            vocab = table_utils.get_vocabulary_from_file(
                vocab, self._index_lookup_layer.encoding)

        self._index_lookup_layer.set_vocabulary(vocab)

        # When doing raw or integer output, we don't have a Vectorize layer to
        # manage. In this case, we can return directly.
        if self._output_mode in [None, INT]:
            return

        # Resize the vectorize layer to the (possibly new) vocab size unless
        # the output width is pinned by pad_to_max + an explicit max_tokens.
        if not self._pad_to_max or self._max_tokens is None:
            num_tokens = self._index_lookup_layer.vocab_size()
            self._vectorize_layer.set_num_elements(num_tokens)

        if self._output_mode == TFIDF:
            # TF-IDF needs one DF value per vocab entry plus an OOV value.
            if df_data is None:
                raise ValueError("df_data must be set if output_mode is TFIDF")
            if len(vocab) != len(df_data):
                raise ValueError("df_data must be the same length as vocab. "
                                 "len(df_data) is %s, len(vocab) is %s" %
                                 (len(vocab), len(df_data)))
            if oov_df_value is None:
                raise ValueError(
                    "You must pass an oov_df_value when output_mode is "
                    "TFIDF.")

            df_data = self._convert_to_ndarray(df_data)
            if not isinstance(oov_df_value, np.ndarray):
                oov_df_value = np.array([oov_df_value])
            # The OOV weight occupies index 0, ahead of the vocab weights.
            df_data = np.insert(df_data, 0, oov_df_value)
            self._vectorize_layer.set_tfidf_data(df_data)