def __init__(self,
             vocab_lookup_table,
             suffix_indicator="##",
             max_bytes_per_word=100,
             max_chars_per_token=None,
             token_out_type=dtypes.int64,
             unknown_token="[UNK]",
             split_unknown_characters=False,
             lower_case=False,
             keep_whitespace=False,
             normalization_form=None,
             preserve_unused_token=False):
  """Initializes the BertTokenizer.

  Builds a `BasicTokenizer` (pre-tokenization) and a `WordpieceTokenizer`
  (subword splitting) from the given options. If `vocab_lookup_table` is a
  string or a string `Tensor`, it is treated as a path to a vocab file and
  wrapped in a `StaticVocabularyTableV1` with a single OOV bucket.
  """
  super(BertTokenizer, self).__init__()
  # Usage-tracking counter for this op (analytics only).
  _tf_text_bert_tokenizer_op_create_counter.get_cell().increase_by(1)
  # A file path (str or Tensor) is materialized into a lookup table here.
  if isinstance(vocab_lookup_table, (str, ops.Tensor)):
    table_init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
    vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
        table_init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                         normalization_form,
                                         preserve_unused_token)
  self._wordpiece_tokenizer = WordpieceTokenizer(
      vocab_lookup_table, suffix_indicator, max_bytes_per_word,
      max_chars_per_token, token_out_type, unknown_token,
      split_unknown_characters)
def __init__(self,
             vocab_lookup_table,
             suffix_indicator="##",
             max_bytes_per_word=100,
             max_chars_per_token=None,
             token_out_type=dtypes.int64,
             unknown_token="[UNK]",
             split_unknown_characters=False,
             lower_case=False,
             keep_whitespace=False,
             normalization_form=None,
             preserve_unused_token=False):
  """Initializes the BertTokenizer.

  Builds a `BasicTokenizer` and a `WordpieceTokenizer` from the given
  options. If `vocab_lookup_table` is a string or a `Tensor`, it is treated
  as a path to a vocab file and wrapped in a `StaticVocabularyTableV1` with
  one OOV bucket.

  Raises:
    RuntimeError: If `lower_case` is a `Tensor` and there is no default
      TF1 session available to evaluate it.
  """
  if isinstance(vocab_lookup_table, (str, ops.Tensor)):
    init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
    vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
        init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  # BasicTokenizer needs a concrete Python bool; a Tensor-valued lower_case
  # is evaluated eagerly via the default TF1 session.
  # (Fixed: removed stray debug print() calls, and replaced the unguarded
  # get_default_session().run(...) — which raised an opaque AttributeError
  # when no session was active — with an explicit error.)
  if isinstance(lower_case, ops.Tensor):
    session = tf.compat.v1.get_default_session()
    if session is None:
      raise RuntimeError(
          "lower_case was passed as a Tensor, but no default session is "
          "available to evaluate it; pass a Python bool or run inside a "
          "tf.compat.v1.Session.")
    lower_case = session.run(lower_case)
  self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                         normalization_form,
                                         preserve_unused_token)
  self._wordpiece_tokenizer = WordpieceTokenizer(
      vocab_lookup_table, suffix_indicator, max_bytes_per_word,
      max_chars_per_token, token_out_type, unknown_token,
      split_unknown_characters)
def __init__(self,
             vocab_lookup_table,
             suffix_indicator='##',
             max_bytes_per_word=100,
             max_chars_per_token=None,
             token_out_type=dtypes.int64,
             unknown_token='[UNK]',
             split_unknown_characters=False):
  """Initializes the WordpieceTokenizer.

  Args:
    vocab_lookup_table: A lookup table implementing the LookupInterface
      containing the vocabulary of subwords or a string which is the file
      path to the vocab.txt file.
    suffix_indicator: (optional) The characters prepended to a wordpiece to
      indicate that it is a suffix to another subword. Default is '##'.
    max_bytes_per_word: (optional) Max size of input token. Default is 100.
    max_chars_per_token: (optional) Max size of subwords, excluding suffix
      indicator. If known, providing this improves the efficiency of
      decoding long words.
    token_out_type: (optional) The type of the token to return. This can be
      `tf.int64` or `tf.int32` IDs, or `tf.string` subwords. The default is
      `tf.int64`.
    unknown_token: (optional) The string value to substitute for an unknown
      token. Default is "[UNK]". If set to `None`, no substitution occurs.
      If `token_out_type` is `tf.int32`/`tf.int64`, the `vocab_lookup_table`
      is used (after substitution) to convert the unknown token to an
      integer.
    split_unknown_characters: (optional) Whether to split out single unknown
      characters as subtokens. If False (default), words containing unknown
      characters will be treated as single unknown tokens.

  Raises:
    TypeError: If `vocab_lookup_table` cannot be resolved to a
      `LookupInterface`.
  """
  super(WordpieceTokenizer, self).__init__()
  # Usage-tracking counter for this op (analytics only).
  _tf_text_wordpiece_tokenizer_op_create_counter.get_cell().increase_by(1)
  # A file path — given as a Python string or a string Tensor — is turned
  # into a static vocabulary table with a single OOV bucket.
  is_path = isinstance(vocab_lookup_table, str) or (
      isinstance(vocab_lookup_table, ops.Tensor) and
      vocab_lookup_table.dtype == dtypes.string)
  if is_path:
    table_init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
    vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
        table_init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  if not isinstance(vocab_lookup_table, lookup_ops.LookupInterface):
    raise TypeError('Unable to build a lookup table from {}'.format(
        vocab_lookup_table))
  self._vocab_lookup_table = vocab_lookup_table
  self._suffix_indicator = suffix_indicator
  self._max_bytes_per_word = max_bytes_per_word
  # The kernel uses 0 to mean "no limit".
  if max_chars_per_token is None:
    self._max_chars_per_token = 0
  else:
    self._max_chars_per_token = max_chars_per_token
  self._token_out_type = token_out_type
  # A falsy unknown_token disables substitution but still records a
  # placeholder string.
  self._unknown_token = unknown_token if unknown_token else '[UNK]'
  self._use_unknown_token = bool(unknown_token)
  self._split_unknown_characters = split_unknown_characters
def _make_model_with_tables(self):
  """Builds a trackable root object holding two hash tables and lookups.

  Returns an `AutoTrackable` with `table1`/`lookup1` (in-memory key/value
  table over "brain"/"salad"/"surgery") and `table2`/`lookup2` (table
  initialized from a generated asset file), where each lookup is a
  `tf.function` with a string-tensor input signature.
  """
  missing_value = -1
  table1 = lookup_ops.HashTable(
      lookup_ops.KeyValueTensorInitializer(
          constant_op.constant(["brain", "salad", "surgery"]),
          constant_op.constant([0, 1, 2], dtypes.int64)),
      missing_value)
  asset_path = self._make_asset("test\nfoo\nbrain\n")
  table2 = lookup_ops.HashTable(
      lookup_ops.TextFileIdTableInitializer(asset_path), missing_value)

  def _as_traced_lookup(table):
    # Wrap the lookup in a tf.function with a fixed string signature; the
    # lambda keeps `table` captured at trace time.
    spec = [tensor_spec.TensorSpec(None, dtypes.string)]
    return def_function.function(input_signature=spec)(
        lambda x: table.lookup(x))  # pylint: disable=unnecessary-lambda

  root = tracking.AutoTrackable()
  root.table1 = table1
  root.lookup1 = _as_traced_lookup(table1)
  root.table2 = table2
  root.lookup2 = _as_traced_lookup(table2)
  return root
def __init__(self,
             vocab_lookup_table,
             suffix_indicator="##",
             max_bytes_per_word=100,
             max_chars_per_token=None,
             token_out_type=dtypes.int64,
             unknown_token="[UNK]",
             split_unknown_characters=False,
             lower_case=False,
             keep_whitespace=False,
             normalization_form=None):
  """Initializes the BertTokenizer.

  Builds a `BasicTokenizer` and a `WordpieceTokenizer` from the given
  options. A string `vocab_lookup_table` is treated as a path to a vocab
  file and wrapped in a `StaticVocabularyTableV1` with one OOV bucket.
  """
  if isinstance(vocab_lookup_table, str):
    table_init = lookup_ops.TextFileIdTableInitializer(vocab_lookup_table)
    vocab_lookup_table = lookup_ops.StaticVocabularyTableV1(
        table_init, num_oov_buckets=1, lookup_key_dtype=dtypes.string)
  self._basic_tokenizer = BasicTokenizer(lower_case, keep_whitespace,
                                         normalization_form)
  self._wordpiece_tokenizer = WordpieceTokenizer(
      vocab_lookup_table, suffix_indicator, max_bytes_per_word,
      max_chars_per_token, token_out_type, unknown_token,
      split_unknown_characters)
def _create_table(self, vocab, num_oov=100):
  """Builds a static vocabulary table from a vocab file path.

  Args:
    vocab: Path to the vocabulary file used to initialize the table.
    num_oov: Number of out-of-vocabulary buckets. Defaults to 100.

  Returns:
    A `StaticVocabularyTableV1` backed by the given vocab file.
  """
  initializer = lookup_ops.TextFileIdTableInitializer(vocab)
  return lookup_ops.StaticVocabularyTableV1(initializer, num_oov)