def __init__(self, max_tokens=None, num_oov_indices=1, mask_token="", oov_token="[UNK]", vocabulary=None, encoding=None, invert=False, **kwargs): allowed_dtypes = [dtypes.string] if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: raise ValueError("StringLookup may only have a dtype in %s." % allowed_dtypes) if "dtype" not in kwargs: kwargs["dtype"] = dtypes.string if encoding is None: encoding = "utf-8" if vocabulary is not None: if isinstance(vocabulary, str): vocabulary = table_utils.get_vocabulary_from_file( vocabulary, encoding) self.encoding = encoding super(StringLookup, self).__init__(max_tokens=max_tokens, num_oov_indices=num_oov_indices, mask_token=mask_token, oov_token=oov_token, vocabulary=vocabulary, invert=invert, **kwargs)
def test_windows_file(self):
  content = b"line1\r\nline2\r\nline3"
  with gfile.GFile(self._vocab_path, "wb") as writer:
    writer.write(content)
  actual = table_utils.get_vocabulary_from_file(self._vocab_path)
  self.assertAllEqual(["line1", "line2", "line3"], actual)
def test_only_line_separator_is_stripped(self):
  expected = ["foo", " foo", "foo ", " foo "]
  with gfile.GFile(self._vocab_path, "w") as writer:
    for word in expected:
      writer.write(word)
      writer.write(os.linesep)
  actual = table_utils.get_vocabulary_from_file(self._vocab_path)
  self.assertAllEqual(expected, actual)
def set_vocabulary(self, vocabulary, idf_weights=None):
  if isinstance(vocabulary, str):
    if self.output_mode == index_lookup.TF_IDF:
      raise RuntimeError("Setting vocabulary directly from a file is not "
                         "supported in TF-IDF mode, since this layer cannot "
                         "read files containing TF-IDF weight data. Please "
                         "read the file using Python and set the vocabulary "
                         "and weights by passing lists or arrays to the "
                         "set_vocabulary function's `vocabulary` and "
                         "`idf_weights` args.")
    vocabulary = table_utils.get_vocabulary_from_file(vocabulary,
                                                      self.encoding)
  super().set_vocabulary(vocabulary, idf_weights=idf_weights)
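# Usage sketch for the method above, showing the workaround its TF-IDF error
# message asks for (assumptions: `layer` is a lookup layer configured for
# TF-IDF output; the file name and tab-separated format are invented for
# illustration). The caller parses the file in Python and passes lists:
with open("vocab_and_weights.txt") as f:
  pairs = [line.strip().split("\t") for line in f]
vocab = [token for token, _ in pairs]
idf_weights = [float(weight) for _, weight in pairs]
layer.set_vocabulary(vocab, idf_weights=idf_weights)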
def __init__(self,
             max_values=None,
             num_oov_indices=1,
             mask_value=0,
             oov_value=-1,
             vocabulary=None,
             invert=False,
             output_mode=index_lookup.INT,
             sparse=False,
             pad_to_max_values=False,
             **kwargs):
  allowed_dtypes = [dtypes.int64]
  if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
    raise ValueError("The value of the dtype argument for IntegerLookup may "
                     "only be one of %s." % (allowed_dtypes,))
  if "dtype" not in kwargs:
    kwargs["dtype"] = dtypes.int64

  # If max_values is set, the value must be greater than 1 - otherwise we
  # are creating a 0-element vocab, which doesn't make sense.
  if max_values is not None and max_values <= 1:
    raise ValueError("If set, max_values must be greater than 1. "
                     "You passed %s" % (max_values,))

  if num_oov_indices < 0:
    raise ValueError(
        "num_oov_indices must be greater than or equal to 0. You passed %s" %
        (num_oov_indices,))

  if vocabulary is not None:
    if isinstance(vocabulary, str):
      vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
      vocabulary = [int(v) for v in vocabulary]

  super(IntegerLookup, self).__init__(
      max_tokens=max_values,
      num_oov_indices=num_oov_indices,
      mask_token=mask_value,
      oov_token=oov_value,
      vocabulary=vocabulary,
      invert=invert,
      output_mode=output_mode,
      sparse=sparse,
      pad_to_max_tokens=pad_to_max_values,
      **kwargs)
  base_preprocessing_layer.keras_kpl_gauge.get_cell("IntegerLookup").set(True)
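# Usage sketch for the IntegerLookup constructor above (assumptions: the
# layer is importable as below in this era; the values are illustrative).
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import IntegerLookup

int_layer = IntegerLookup(vocabulary=[12, 36, 1138])
# With the defaults above, index 0 is the mask value 0 and index 1 is the
# single OOV index, so 12 -> 2, 36 -> 3, 1138 -> 4.
int_ids = int_layer(tf.constant([[12, 1138, 42]]))  # 42 is OOV -> index 1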
def __init__(self, max_tokens=None, num_oov_indices=1, mask_token="", oov_token="[UNK]", vocabulary=None, encoding=None, invert=False, output_mode=index_lookup.INT, sparse=False, pad_to_max_tokens=False, **kwargs): allowed_dtypes = [dtypes.string] if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: raise ValueError( "The value of the dtype argument for StringLookup may " "only be one of %s." % (allowed_dtypes, )) if "dtype" not in kwargs: kwargs["dtype"] = dtypes.string if encoding is None: encoding = "utf-8" if vocabulary is not None: if isinstance(vocabulary, str): vocabulary = table_utils.get_vocabulary_from_file( vocabulary, encoding) self.encoding = encoding super(StringLookup, self).__init__(max_tokens=max_tokens, num_oov_indices=num_oov_indices, mask_token=mask_token, oov_token=oov_token, vocabulary=vocabulary, invert=invert, output_mode=output_mode, sparse=sparse, pad_to_max_tokens=pad_to_max_tokens, **kwargs) base_preprocessing_layer.keras_kpl_gauge.get_cell("StringLookup").set( True)
def __init__(self,
             max_values=None,
             num_oov_indices=1,
             mask_value=0,
             oov_value=-1,
             vocabulary=None,
             invert=False,
             **kwargs):
  allowed_dtypes = [dtypes.int64]
  if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
    raise ValueError("IntegerLookup may only have a dtype in %s." %
                     allowed_dtypes)
  if "dtype" not in kwargs:
    kwargs["dtype"] = dtypes.int64

  # If max_values is set, the value must be greater than 1 - otherwise we
  # are creating a 0-element vocab, which doesn't make sense.
  if max_values is not None and max_values <= 1:
    raise ValueError("If set, max_values must be greater than 1.")

  if num_oov_indices < 0:
    raise ValueError(
        "num_oov_indices must be greater than or equal to 0. You passed %s" %
        num_oov_indices)

  if vocabulary is not None:
    if isinstance(vocabulary, str):
      vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
      vocabulary = [int(v) for v in vocabulary]

  super(IntegerLookup, self).__init__(
      max_tokens=max_values,
      num_oov_indices=num_oov_indices,
      mask_token=mask_value,
      oov_token=oov_value,
      vocabulary=vocabulary,
      invert=invert,
      **kwargs)
  base_preprocessing_layer._kpl_gauge.get_cell("V2").set("IntegerLookup")
def set_vocabulary(self, vocab):
  if isinstance(vocab, str):
    vocab = table_utils.get_vocabulary_from_file(vocab, self.encoding)
  super().set_vocabulary(vocab)
def __init__(self,
             max_tokens=None,
             num_oov_tokens=1,
             vocabulary=None,
             reserve_zero=True,
             mask_zero=False,
             **kwargs):
  invert = False
  if invert:
    allowed_dtypes = [dtypes.int32, dtypes.int64]
  else:
    allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64]

  if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
    raise ValueError("IndexLookup may only have a dtype in %s." %
                     allowed_dtypes)

  if "dtype" not in kwargs:
    kwargs["dtype"] = dtypes.int64 if invert else dtypes.string

  # If max_tokens is set, the value must be greater than 1 - otherwise we
  # are creating a 0-element vocab, which doesn't make sense.
  if max_tokens is not None and max_tokens <= 1:
    raise ValueError("If set, max_tokens must be greater than 1.")

  if num_oov_tokens < 0:
    raise ValueError("num_oov_tokens must be greater than or equal to 0. "
                     "You passed %s" % num_oov_tokens)

  self.invert = invert
  self.max_tokens = max_tokens
  self.num_oov_tokens = num_oov_tokens
  self.reserve_zero = reserve_zero
  self.mask_zero = mask_zero

  # We need to reserve at least num_oov_tokens tokens, plus one additional
  # value if we are reserving the zero value in our output.
  if reserve_zero:
    self._reserved_values = (num_oov_tokens + 1)
  else:
    self._reserved_values = num_oov_tokens

  # We need to account for the OOV buckets in our vocabulary size.
  if max_tokens is not None:
    self._max_elements = max_tokens - num_oov_tokens
  else:
    self._max_elements = None

  # If there is only one OOV bucket, we can determine the OOV value (either 0
  # or 1 depending on whether 0 is reserved) and set that as the default
  # value of the index_lookup table. If we have multiple OOV values, we need
  # to do a further hashing step; to make this easier, we set the OOV value
  # to -1. (This lets us do a vectorized add and cast to boolean to determine
  # locations where we need to do extra hashing.)
  if self.num_oov_tokens == 1:
    self._oov_value = 1 if reserve_zero else 0
  else:
    self._oov_value = -1

  super(IndexLookup, self).__init__(
      combiner=_IndexLookupCombiner(self.max_tokens), **kwargs)

  # If the layer's input type is int32, we can only output int32 values -
  # MutableHashTable doesn't allow us to map int32->int64.
  if self.dtype == dtypes.int32:
    self._output_dtype = dtypes.int32
  else:
    self._output_dtype = dtypes.int64

  self._table = lookup_ops.MutableHashTable(
      key_dtype=self.dtype,
      value_dtype=self._output_dtype,
      default_value=self._oov_value,
      name=(self._name + "_index_table"))
  tracked_table = self._add_trackable(self._table, trainable=False)
  # This is a workaround for summary() on this layer. Because the table is
  # not mutable during training, the effective number of parameters (and so
  # the weight shape) is 0; we add this as an attr so that the parameter
  # counting code in the Model object doesn't throw an attribute error.
  tracked_table.shape = tensor_shape.TensorShape((0,))

  if self.num_oov_tokens <= 1:
    oov_tokens = None
  else:
    oov_start = 1 if reserve_zero else 0
    oov_tokens = list(range(oov_start, self._reserved_values))

  self._table_handler = table_utils.TableHandler(
      table=self._table,
      oov_tokens=oov_tokens,
      use_v1_apis=self._use_v1_apis())

  if vocabulary is not None:
    if isinstance(vocabulary, str):
      vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
    table_utils.validate_vocabulary_is_unique(vocabulary)
    self.set_vocabulary(vocabulary)
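# Worked example of the reserved-index bookkeeping in the constructor above
# (pure Python; the values follow directly from the branches shown there).
reserve_zero, num_oov_tokens = True, 2
reserved_values = num_oov_tokens + 1 if reserve_zero else num_oov_tokens
oov_start = 1 if reserve_zero else 0
oov_tokens = list(range(oov_start, reserved_values))
assert reserved_values == 3 and oov_tokens == [1, 2]
# With more than one OOV bucket, the table's default value is -1, which the
# TableHandler uses as a sentinel to re-hash OOV keys into oov_tokens.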
def set_vocabulary(self, vocab):
  if isinstance(vocab, str):
    vocab = table_utils.get_vocabulary_from_file(vocab)
    vocab = [int(v) for v in vocab]
  super().set_vocabulary(vocab)
def set_vocabulary(self, vocab, df_data=None, oov_df_value=None):
  """Sets vocabulary (and optionally document frequency) data for this layer.

  This method sets the vocabulary and DF data for this layer directly,
  instead of analyzing a dataset through 'adapt'. It should be used whenever
  the vocab (and optionally document frequency) information is already known.
  If vocabulary data is already present in the layer, this method will
  replace it.

  Arguments:
    vocab: An array of string tokens, or a path to a file containing one
      token per line.
    df_data: An array of document frequency data. Only necessary if the layer
      output_mode is TFIDF.
    oov_df_value: The document frequency of the OOV token. Only necessary if
      output_mode is TFIDF.

  Raises:
    ValueError: If there are too many inputs, the inputs do not match, or
      input data is missing.
    RuntimeError: If the vocabulary cannot be set when this function is
      called. This happens in "binary", "count", and "tfidf" modes if
      "pad_to_max_tokens" is False and the layer itself has already been
      called.
  """
  if self._output_mode != TFIDF and df_data is not None:
    raise ValueError("df_data should only be set if output_mode is TFIDF. "
                     "output_mode is %s." % self._output_mode)

  if (self._output_mode in [BINARY, COUNT, TFIDF] and self._called and
      not self._pad_to_max):
    raise RuntimeError(("When using TextVectorization in {mode} mode and "
                        "pad_to_max_tokens is False, the vocabulary cannot "
                        "be changed after the layer is "
                        "called.").format(mode=self._output_mode))

  # Handle reading from a file. We can't do this via TF-IDF, as we don't have
  # a standard format - we error out and ask our users to parse the file
  # themselves.
  if isinstance(vocab, str):
    if self._output_mode == TFIDF:
      raise RuntimeError("Setting vocabulary directly from a file is not "
                         "supported in TF-IDF mode, since this layer cannot "
                         "read files containing TF-IDF weight data. Please "
                         "read the file using Python and set the vocab "
                         "and weights by passing lists or arrays to the "
                         "set_vocabulary function's `vocab` and `df_data` "
                         "args.")
    vocab = table_utils.get_vocabulary_from_file(
        vocab, self._index_lookup_layer.encoding)

  self._index_lookup_layer.set_vocabulary(vocab)

  # When doing raw or integer output, we don't have a Vectorize layer to
  # manage. In this case, we can return directly.
  if self._output_mode in [None, INT]:
    return

  if not self._pad_to_max or self._max_tokens is None:
    num_tokens = self._index_lookup_layer.vocab_size()
    self._vectorize_layer.set_num_elements(num_tokens)

  if self._output_mode == TFIDF:
    if df_data is None:
      raise ValueError("df_data must be set if output_mode is TFIDF")
    if len(vocab) != len(df_data):
      raise ValueError("df_data must be the same length as vocab. "
                       "len(df_data) is %s, len(vocab) is %s" %
                       (len(df_data), len(vocab)))
    if oov_df_value is None:
      raise ValueError("You must pass an oov_df_value when output_mode is "
                       "TFIDF.")

    df_data = self._convert_to_ndarray(df_data)
    if not isinstance(oov_df_value, np.ndarray):
      oov_df_value = np.array([oov_df_value])
    df_data = np.insert(df_data, 0, oov_df_value)
    self._vectorize_layer.set_tfidf_data(df_data)
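# Usage sketch for the method above (assumptions: `layer` is a
# TextVectorization instance of this era created with output_mode="tf-idf";
# the tokens and counts are illustrative). df_data must align with vocab
# one-to-one, and the OOV document frequency is passed separately so the
# layer can insert it at index 0:
layer.set_vocabulary(
    vocab=["earth", "wind", "fire"],
    df_data=[273, 184, 95],
    oov_df_value=10)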