def translate(sentence, transformer):
    tokenizer_en = tf_text.BertTokenizer('vocabs/vocab_en.txt')
    tokenizer_es = tf_text.BertTokenizer('vocabs/vocab_es.txt')

    # Tokenize the English input and add the start/end markers.
    sentence = tf.convert_to_tensor([sentence])
    encoder_input = tokenizer_en.tokenize(sentence)
    encoder_input = encoder_input.merge_dims(-2, -1)
    encoder_input = add_start_end(encoder_input).to_tensor()

    # Autoregressive decoding: feed the output built so far back into the model.
    output = tf.convert_to_tensor([START])
    output = tf.expand_dims(output, 0)
    for i in range(MAX_LENGTH):
        predictions = transformer(encoder_input, output, False)
        predictions = predictions[:, -1:, :]
        predicted_id = tf.argmax(predictions, axis=-1)
        output = tf.concat([output, predicted_id], axis=-1)
        if predicted_id == END:
            break

    # Detokenize and drop the '[START] ' prefix and ' [END]' suffix from the decoded text.
    text = tokenizer_es.detokenize(output)[0].numpy()
    text = tf.strings.reduce_join(text, separator=' ', axis=-1)
    return text.numpy().decode('utf-8')[8:-6]
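A minimal usage sketch (not from the original source) for the translate() helper above; it assumes a trained transformer, the START/END/MAX_LENGTH constants, the add_start_end() helper, and the two vocab files are already defined in the surrounding project:

import tensorflow as tf
import tensorflow_text as tf_text

# Hypothetical call, assuming `transformer` was trained elsewhere in this project.
print(translate('How are you today?', transformer))  # prints the Spanish output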
def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=False)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    vocab = pathlib.Path(vocab_path).read_text().splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments.
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()
def show_html(self, token_impact: np.ndarray, initial_tokens: np.ndarray, label_index: int = 0):
    # sp = spm.SentencePieceProcessor(model_file='../inputs/embd/sentencepiece_bpe.model')
    tokenizer = text.BertTokenizer("../inputs/bert_tokens.model")
    arr = np.zeros(99757)
    color_map = cm.get_cmap("Reds")
    token_impact /= np.max(token_impact)  # normalization to the range [0, 1]

    with open("../outputs/text_{}.html".format(self.model_name), "a") as file:
        file.write("<div><h2>Author #{}</h2>\n".format(label_index))
        for token, impact in zip(initial_tokens, token_impact):
            # if impact > 0.5:
            arr[int(token)] += 1
            local_impact = self.get_color(color_map, impact)
            word = tokenizer.detokenize([[int(token)]]).to_list()[0][0].decode("utf-8")

            # special tokens
            if word == "TAB":
                word = "    "
            elif word == "SPC":
                word = " "
            elif word == "NLN":
                file.write("<br>")
                continue

            file.write("<span style='background-color: rgba({}, {}, {}, {})'>{}</span>"
                       .format(*local_impact, word))
        file.write("</div>")
    return arr
def get_tf_tokenizer(module_handle, tokenization_info=None):
    """Creates a preprocessing function."""
    LOGGER.debug("(get_tf_tokenizer): get_tokenization_info")
    # We get tokenization info to know where the vocab is and whether the model
    # is lower cased.
    if tokenization_info is None:
        tokenization_info = get_tokenization_info(module_handle=module_handle)

    LOGGER.debug("(get_tf_tokenizer): tf.lookup.TextFileInitializer")
    # Create a lookup table initializer from a text file (the vocab file).
    table_initializer = tf.lookup.TextFileInitializer(
        filename=tokenization_info["vocab_file"],
        key_dtype=tf.string,
        key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype=tf.int64,
        value_index=tf.lookup.TextFileIndex.LINE_NUMBER)

    LOGGER.debug("(get_tf_tokenizer): tf.lookup.StaticVocabularyTable")
    # Make the table itself.
    vocab_lookup_table = tf.lookup.StaticVocabularyTable(
        initializer=table_initializer,
        num_oov_buckets=1,
        lookup_key_dtype=tf.string)

    LOGGER.debug("(get_tf_tokenizer): tf_text.BertTokenizer")
    # Build the tokenizer.
    tokenizer = tf_text.BertTokenizer(
        vocab_lookup_table=vocab_lookup_table,
        lower_case=tokenization_info["do_lower_case"])

    LOGGER.debug("(get_tf_tokenizer): Done")
    return tokenizer, vocab_lookup_table
def tokenize_single_sentence(self,
                             sequence,
                             max_len=128,
                             addCLS=True,
                             addSEP=True):
    """Tokenize a single sentence into IDs using the provided vocab.txt.

    Add special tokens according to config.
    """
    tokenizer = text.BertTokenizer(self.vocab_dir, token_out_type=tf.int64)
    word_id = tokenizer.tokenize(sequence)
    word_id = word_id.merge_dims(1, 2)[:, :max_len]
    word_id = word_id.to_tensor(default_value=self.PAD_ID)

    if addCLS:
        CLSToken = tf.fill([tf.shape(sequence)[0], 1], self.CLS_ID)
        word_id = word_id[:, :max_len - 1]
        word_id = tf.concat([CLSToken, word_id], axis=1)

    if addSEP:
        SEPToken = tf.fill([tf.shape(sequence)[0], 1], self.SEP_ID)
        word_id = word_id[:, :max_len - 1]
        word_id = tf.concat([word_id, SEPToken], axis=1)

    word_id = tf.pad(word_id, [[0, 0], [0, max_len]], constant_values=self.PAD_ID)
    word_id = tf.slice(word_id, [0, 0], [-1, max_len])

    # Mask to distinguish padded values.
    input_mask = tf.cast(word_id > 0, tf.int64)

    # Mask to distinguish two sentences. In this case, just one sentence.
    segment_id = tf.fill(tf.shape(input_mask), tf.constant(0, dtype=tf.int64))

    return word_id, input_mask, segment_id
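A hypothetical usage sketch (not from the original source): `preprocessor` is assumed to be an instance of the enclosing class with `vocab_dir`, `PAD_ID`, `CLS_ID`, and `SEP_ID` configured. It shows the fixed-length outputs produced by the method above:

# Assumes `preprocessor` exposes the method defined above.
sentences = tf.constant(['a first example sentence', 'a second one'])
word_id, input_mask, segment_id = preprocessor.tokenize_single_sentence(sentences, max_len=16)
print(word_id.shape, input_mask.shape, segment_id.shape)  # each is (2, 16)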
def __init__(self, *, vocab_file: str, lower_case: bool,
             tokenize_with_offsets: bool = False, **kwargs):
    """Initialize a BertTokenizer layer.

    Args:
      vocab_file: A Python string with the path of the vocabulary file.
        This is a text file with newline-separated wordpiece tokens.
        This layer initializes a lookup table from it that gets used with
        text.BertTokenizer.
      lower_case: A Python boolean forwarded to text.BertTokenizer.
        If true, input text is converted to lower case (where applicable)
        before tokenization. This must be set to match the way in which
        the vocab_file was created.
      tokenize_with_offsets: A Python boolean. If true, this layer calls
        BertTokenizer.tokenize_with_offsets() instead of plain .tokenize()
        and outputs a triple of (tokens, start_offsets, limit_offsets)
        instead of just tokens.
      **kwargs: standard arguments to Layer().

    Raises:
      ImportError: if importing tensorflow_text failed.
    """
    _check_if_tf_text_installed()

    self.tokenize_with_offsets = tokenize_with_offsets
    self._vocab_table = self._create_vocab_table(vocab_file)
    self._special_tokens_dict = self._create_special_tokens_dict(
        self._vocab_table, vocab_file)
    super().__init__(**kwargs)
    self._bert_tokenizer = text.BertTokenizer(self._vocab_table,
                                              lower_case=lower_case)
def __init__(self, params):
    super().__init__(params)
    self._tokenizer = tf_text.BertTokenizer(
        params.vocab_path,
        lower_case=True,
        max_bytes_per_word=200,
        token_out_type=tf.int32,
    )
def _tokenize(stringA):
    """Tokenize the sentence and insert appropriate tokens."""
    tokenizer = text.BertTokenizer(
        "vocab.txt",
        token_out_type=tf.string,
    )
    stringA = tf.squeeze(stringA)
    idA = tokenizer.tokenize(stringA)
    # idB = tokenizer.tokenize(stringB)
    return idA.merge_dims(-2, -1).to_sparse()
def __post_init__(self):
    tokenizer = tensorflow_text.BertTokenizer(self.vocab_path,
                                              token_out_type=tf.int32,
                                              lower_case=True)
    with tf.io.gfile.GFile(self.vocab_path) as f:
        vocab = f.read().split('\n')
    cls_token = vocab.index('[CLS]')
    # Work-around for frozen dataclasses:
    # https://stackoverflow.com/questions/53756788
    object.__setattr__(self, 'cls_token', cls_token)
    object.__setattr__(self, '_tokenizer', tokenizer)
def __init__(self, *, vocab_file: str, lower_case: Optional[bool] = None,
             tokenize_with_offsets: bool = False,
             tokenizer_kwargs: Optional[Mapping[Text, Any]] = None, **kwargs):
    """Initialize a `BertTokenizer` layer.

    Args:
      vocab_file: A Python string with the path of the vocabulary file.
        This is a text file with newline-separated wordpiece tokens.
        This layer initializes a lookup table from it that gets used with
        `text.BertTokenizer`.
      lower_case: Optional boolean forwarded to `text.BertTokenizer`.
        If true, input text is converted to lower case (where applicable)
        before tokenization. This must be set to match the way in which
        the `vocab_file` was created. If passed, this overrides whatever value
        may have been passed in `tokenizer_kwargs`.
      tokenize_with_offsets: A Python boolean. If true, this layer calls
        `text.BertTokenizer.tokenize_with_offsets()` instead of plain
        `text.BertTokenizer.tokenize()` and outputs a triple of
        `(tokens, start_offsets, limit_offsets)` instead of just tokens.
      tokenizer_kwargs: Optional mapping with keyword arguments to forward to
        `text.BertTokenizer`'s constructor.
      **kwargs: Standard arguments to `Layer()`.

    Raises:
      ImportError: If importing `tensorflow_text` failed.
    """
    _check_if_tf_text_installed()

    self.tokenize_with_offsets = tokenize_with_offsets
    # TODO(b/177326279): Stop storing the vocab table initializer as an
    # attribute when https://github.com/tensorflow/tensorflow/issues/46456
    # has been fixed in the TensorFlow versions of the TF Hub users that load
    # a SavedModel created from this layer. Due to that issue, loading such a
    # SavedModel forgets to add .vocab_table._initializer as a trackable
    # dependency of .vocab_table, so that saving it again to a second SavedModel
    # (e.g., the final model built using TF Hub) does not properly track
    # the ._vocab_table._initializer._filename as an Asset.
    self._vocab_table, self._vocab_initializer_donotuse = (
        self._create_vocab_table_and_initializer(vocab_file))
    self._special_tokens_dict = self._create_special_tokens_dict(
        self._vocab_table, vocab_file)
    super().__init__(**kwargs)

    tokenizer_kwargs = dict(tokenizer_kwargs or {})
    if lower_case is not None:
        tokenizer_kwargs["lower_case"] = lower_case
    self._bert_tokenizer = text.BertTokenizer(self._vocab_table, **tokenizer_kwargs)
def build(self, input_shape: tf.TensorShape) -> None:
    self.tokenizer = tftext.BertTokenizer(
        tf.lookup.StaticVocabularyTable(
            tf.lookup.KeyValueTensorInitializer(
                self.vocab,
                list(range(len(self.vocab))),
                key_dtype=tf.string,
                value_dtype=tf.int64,
            ),
            1,
        ),
        max_chars_per_token=self.max_chars_per_token,
    )
    super().build(input_shape)
def initial_preprocess(self, df_path: str, tmp_dataset_filename: str):
    df = self._initial_load(df_path)
    df = df[(df.n_lines > 0)]

    # Tokenize. Takes time (approx. 1 h).
    df.flines = df.flines.apply(self._insert_tokens)
    print("updated")

    text_dataset = tf.data.Dataset.from_tensor_slices(df.flines.values)
    vocab = bert_vocab.bert_vocab_from_dataset(
        text_dataset,
        **bert_vocab_args
    )
    self._write_vocab_file("../inputs/bert_tokens.model", vocab)
    print("saved")

    # Read the tokenizer.
    tokenizer = text.BertTokenizer("../inputs/bert_tokens.model")

    # Reduce the size of the dataset according to n_tokens.
    df.index = np.arange(len(df))
    df["n_tokens"] = df.flines.apply(lambda x: tokenizer.tokenize(x).shape[0])
    df = df[df.n_tokens <= self.input_size]
    # Reindex.
    df.index = np.arange(len(df))
    # Reduce size.
    df = self._user_selection_and_encoding(df, 50, 450)

    # Long saving.
    # The issue is that `tokenizer.tokenize()` does not always return a shape (-1, 1).
    # Some elements of the result could be a list, e.g. [[2929, 8524]]:
    #   >> tokenizer.detokenize([[2929, 8524]])
    #   <tf.RaggedTensor [[b'visdist']]>
    #   >> tokenizer.detokenize([[2929]])
    #   <tf.RaggedTensor [[b'vis']]>
    #   >> tokenizer.detokenize([[8524]])
    #   <tf.RaggedTensor [[b'##dist']]>
    # I have decided to flatten these lists.
    df["tokens"] = df.flines.apply(lambda x: tokenizer.tokenize(x).to_list())
    df.tokens = df.tokens.apply(lambda x: list(pd.core.common.flatten(x)))

    dataset = df[["user", "tokens", "task"]]
    # Shuffle the dataset.
    dataset = dataset.sample(frac=1)

    def rsh(x):
        arr = np.array(x)
        arr = np.resize(arr, (self.input_size, 1))
        return arr.tolist()

    dataset.tokens = dataset.tokens.apply(rsh)
    dataset.to_json(tmp_dataset_filename)
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    test = tf.constant(['test sentence'])
    tokenizer = text.BertTokenizer(
        "vocab.txt",
        token_out_type=tf.string,
    )
    output = tokenizer.tokenize(test)
    return inputs
def convert_huggingface_tokenizer(huggingface_tokenizer,
                                  suffix_indicator="##",
                                  max_chars_per_token=None,
                                  split_unknown_characters=True,
                                  lower_case=True,
                                  keep_whitespace=False,
                                  normalization_form=None,
                                  preserve_unused_token=True,
                                  dtype=tf.int32):
    # Vocabulary lookup table mapping wordpiece strings to their ids.
    vocab_lookup_table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            keys=list(huggingface_tokenizer.vocab.keys()),
            values=tf.constant(list(huggingface_tokenizer.vocab.values()),
                               dtype=tf.int64)),
        default_value=0)

    # Table flagging which token ids are special tokens (1) versus regular tokens (0).
    special_ids_mask_table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(huggingface_tokenizer.all_special_ids, dtype=dtype),
            values=tf.constant(1, dtype=dtype,
                               shape=len(huggingface_tokenizer.all_special_ids)),
            key_dtype=dtype,
            value_dtype=dtype),
        default_value=tf.constant(0, dtype=dtype))

    # Forward the function's arguments instead of hard-coding their defaults.
    tokenizer_tf_text = tf_text.BertTokenizer(
        vocab_lookup_table=vocab_lookup_table,
        suffix_indicator=suffix_indicator,
        max_bytes_per_word=huggingface_tokenizer.wordpiece_tokenizer.max_input_chars_per_word,
        max_chars_per_token=max_chars_per_token,
        token_out_type=dtype,
        unknown_token=huggingface_tokenizer.unk_token,
        split_unknown_characters=split_unknown_characters,
        lower_case=lower_case,
        keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token)

    return tokenizer_tf_text, vocab_lookup_table, special_ids_mask_table
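A hypothetical usage sketch (not from the original source): it converts a slow (pure-Python) Hugging Face BertTokenizer into its TF Text counterpart via the helper above and tokenizes a sample sentence. The 'bert-base-uncased' checkpoint name is only an example.

from transformers import BertTokenizer

hf_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # example checkpoint
tf_tok, vocab_table, special_mask = convert_huggingface_tokenizer(hf_tokenizer)

# tokenize() returns [batch, words, wordpieces]; merge the last two axes.
ids = tf_tok.tokenize(['the quick brown fox']).merge_dims(-2, -1)
print(ids.to_list())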
def __init__(self, *, vocab_file: str, lower_case: bool,
             tokenize_with_offsets: bool = False, **kwargs):
    """Initialize a BertTokenizer layer.

    Args:
      vocab_file: A Python string with the path of the vocabulary file.
        This is a text file with newline-separated wordpiece tokens.
        This layer initializes a lookup table from it that gets used with
        text.BertTokenizer.
      lower_case: A Python boolean forwarded to text.BertTokenizer.
        If true, input text is converted to lower case (where applicable)
        before tokenization. This must be set to match the way in which
        the vocab_file was created.
      tokenize_with_offsets: A Python boolean. If true, this layer calls
        BertTokenizer.tokenize_with_offsets() instead of plain .tokenize()
        and outputs a triple of (tokens, start_offsets, limit_offsets)
        instead of just tokens.
      **kwargs: standard arguments to Layer().

    Raises:
      ImportError: if importing tensorflow_text failed.
    """
    _check_if_tf_text_installed()

    self.tokenize_with_offsets = tokenize_with_offsets
    # TODO(b/177326279): Stop storing the vocab table initializer as an
    # attribute when https://github.com/tensorflow/tensorflow/issues/46293
    # has been fixed in the TensorFlow versions of the TF Hub users that load
    # a SavedModel created from this layer. Due to that issue, loading such a
    # SavedModel forgets to add .vocab_table._initializer as a trackable
    # dependency of .vocab_table, so that saving it again to a second SavedModel
    # (e.g., the final model built using TF Hub) does not properly track
    # the ._vocab_table._initializer._filename as an Asset.
    self._vocab_table, self._vocab_initializer_donotuse = (
        self._create_vocab_table_and_initializer(vocab_file))
    self._special_tokens_dict = self._create_special_tokens_dict(
        self._vocab_table, vocab_file)
    super().__init__(**kwargs)
    self._bert_tokenizer = text.BertTokenizer(self._vocab_table,
                                              lower_case=lower_case)
def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    vocab = pathlib.Path(vocab_path).read_text().splitlines()
    self.vocab = tf.Variable(vocab)

    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
    self.get_reserved_tokens.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_vocab_size.get_concrete_function()
def classify(tweet, model):
    tokenizer = tf_text.BertTokenizer('vocab.txt')

    input = tokenizer.tokenize([tweet])
    input = input.merge_dims(-2, -1)
    input = tf.keras.preprocessing.sequence.pad_sequences(input.to_list(),
                                                          padding="post",
                                                          maxlen=params['MAX_LEN'])
    prediction = model(input, training=False).numpy()
    if prediction[0][0] > .5:
        sentiment = 'positivo'
        value = prediction[0][0] * 100
    else:
        sentiment = 'negativo'
        value = (1 - prediction[0][0]) * 100
    return value, sentiment
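A hypothetical usage sketch (not from the original source): it assumes a trained Keras `model`, the 'vocab.txt' wordpiece file, and the `params` dict with 'MAX_LEN' defined elsewhere in the project.

# Assumes `model` and `params['MAX_LEN']` are defined as above.
confidence, sentiment = classify('me encanta este producto', model)
print('{}: {:.1f}%'.format(sentiment, confidence))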
def get_tf_tokenizer(module_handle):
    """Creates a preprocessing function."""
    tokenization_info = get_tokenization_info(module_handle)
    table_initializer = tf.lookup.TextFileInitializer(
        filename=tokenization_info["vocab_file"],
        key_dtype=tf.string,
        key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype=tf.int64,
        value_index=tf.lookup.TextFileIndex.LINE_NUMBER)
    vocab_lookup_table = tf.lookup.StaticVocabularyTable(
        initializer=table_initializer,
        num_oov_buckets=1,
        lookup_key_dtype=tf.string)
    tokenizer = tf_text.BertTokenizer(
        vocab_lookup_table=vocab_lookup_table,
        lower_case=tokenization_info["do_lower_case"])
    return tokenizer, vocab_lookup_table
def tokenize_single_sentence_unpad(self,
                                   sequence: tf.Tensor,
                                   max_len: int = 128,
                                   add_cls: bool = True,
                                   add_sep: bool = True):
    """Tokenize a sentence with the BERT model vocab file and without padding.

    Add special tokens according to config.

    Args:
      sequence: Tensor of shape [batch_size, 1].
      max_len: The number of tokens after padding and truncating.
      add_cls: Whether to add a CLS token at the front of each sequence.
      add_sep: Whether to add a SEP token at the end of each sequence.

    Returns:
      word_ids: Ragged tokenized sequences [batch_size, None].
    """
    vocab_file_path = self._model.resolved_object.vocab_file.asset_path
    tokenizer = text.BertTokenizer(
        vocab_file_path,
        lower_case=self._do_lower_case,
        token_out_type=tf.int64)

    word_ids = tokenizer.tokenize(sequence)

    # The tokenizer puts each token's word pieces on an extra axis;
    # merge_dims flattens them into a single token sequence.
    word_ids = word_ids.merge_dims(-2, -1)

    if add_cls:
        cls_token = tf.fill([tf.shape(sequence)[0], 1],
                            tf.constant(self._cls_id, dtype=tf.int64))
        word_ids = tf.concat([cls_token, word_ids], 1)

    if add_sep:
        sep_token = tf.fill([tf.shape(sequence)[0], 1],
                            tf.constant(self._sep_id, dtype=tf.int64))
        word_ids = word_ids[:, :max_len - 1]
        word_ids = tf.concat([word_ids, sep_token], 1)

    return word_ids
# Commented out IPython magic to ensure Python compatibility.
# %%time
pada_vocab = bert_vocab.bert_vocab_from_dataset(
    train_pada.batch(1000).prefetch(2),
    **bert_vocab_args)

print(pada_vocab[:10])
print(pada_vocab[100:110])
print(pada_vocab[1000:1010])
print(pada_vocab[-10:])

write_vocab_file(work_dir + 'pada_vocab.txt', pada_vocab)

# !ls *.txt

samh_tokenizer = text.BertTokenizer(work_dir + 'samh_vocab.txt', **bert_tokenizer_params)
pada_tokenizer = text.BertTokenizer(work_dir + 'pada_vocab.txt', **bert_tokenizer_params)

for samh_examples, pada_examples in train_examples.batch(3).take(1):
    for ex in pada_examples:
        print(ex.numpy())

# Tokenize the examples -> (batch, word, word-piece)
token_batch = pada_tokenizer.tokenize(pada_examples)
# Merge the word and word-piece axes -> (batch, tokens)
token_batch = token_batch.merge_dims(-2, -1)

# for ex in token_batch.to_list():
#     print(ex)
def preprocessing_fn(inputs):
    """Preprocess input column of text into transformed columns of
    * input token ids
    * input mask
    * input type ids
    """
    CLS_ID = tf.constant(101, dtype=tf.int64)
    SEP_ID = tf.constant(102, dtype=tf.int64)
    PAD_ID = tf.constant(0, dtype=tf.int64)

    vocab_file_path = load_bert_layer().resolved_object.vocab_file.asset_path
    bert_tokenizer = text.BertTokenizer(vocab_lookup_table=vocab_file_path,
                                        token_out_type=tf.int64,
                                        lower_case=do_lower_case)

    def tokenize_text(text, sequence_length=MAX_SEQ_LEN):
        """Perform the BERT preprocessing from text -> input token ids."""
        # Convert text into token ids.
        tokens = bert_tokenizer.tokenize(text)

        # Flatten the output ragged tensors.
        tokens = tokens.merge_dims(1, 2)[:, :sequence_length]

        # Add start and end token ids to the id sequence.
        start_tokens = tf.fill([tf.shape(text)[0], 1], CLS_ID)
        end_tokens = tf.fill([tf.shape(text)[0], 1], SEP_ID)
        tokens = tokens[:, :sequence_length - 2]
        tokens = tf.concat([start_tokens, tokens, end_tokens], axis=1)

        # Truncate sequences greater than MAX_SEQ_LEN.
        tokens = tokens[:, :sequence_length]

        # Pad shorter sequences with the pad token id.
        tokens = tokens.to_tensor(default_value=PAD_ID)
        pad = sequence_length - tf.shape(tokens)[1]
        tokens = tf.pad(tokens, [[0, 0], [0, pad]], constant_values=PAD_ID)

        # And finally reshape the word token ids to fit the output
        # data structure of TFT.
        return tf.reshape(tokens, [-1, sequence_length])

    def preprocess_bert_input(text):
        """Convert input text into the input_word_ids, input_mask, input_type_ids."""
        input_word_ids = tokenize_text(text)
        input_mask = tf.cast(input_word_ids > 0, tf.int64)
        input_mask = tf.reshape(input_mask, [-1, MAX_SEQ_LEN])

        zeros_dims = tf.stack(tf.shape(input_mask))
        input_type_ids = tf.fill(zeros_dims, 0)
        input_type_ids = tf.cast(input_type_ids, tf.int64)

        return (input_word_ids, input_mask, input_type_ids)

    input_word_ids, input_mask, input_type_ids = \
        preprocess_bert_input(tf.squeeze(inputs['text'], axis=1))

    return {
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': input_type_ids,
        'label': inputs['label']
    }
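A hypothetical eager-mode sketch (not from the original source): it calls the preprocessing_fn above on a tiny raw batch, assuming the surrounding module defines `load_bert_layer`, `do_lower_case`, and `MAX_SEQ_LEN` as the function expects.

# Example raw batch in the shape the function expects ('text' has shape [batch, 1]).
raw = {
    'text': tf.constant([['a first review'], ['a second, longer review']]),
    'label': tf.constant([0, 1]),
}
transformed = preprocessing_fn(raw)
print(transformed['input_word_ids'].shape)  # (2, MAX_SEQ_LEN)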
# Load the vocabulary we created earlier.
bert_tokenizer_params = dict(lower_case=True)
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # Maximum size of the vocabulary.
    vocab_size=8000 * 7,
    # Reserved word pieces that must be included.
    reserved_tokens=reserved_tokens,
    # Additional arguments.
    bert_tokenizer_params=bert_tokenizer_params,
    learn_params={},
)

# Create a tokenizer that splits text into word pieces.
tokenizer = text.BertTokenizer('vocab.txt', **bert_tokenizer_params)
tokenlist = open('vocab.txt', 'r', encoding="utf-8").readlines()

# The ID used for padding.
PAD_ID = 0
# Maximum length of the vector;
# shorter vectors are padded up to this length.
max_seq_len = 20
langs = 7

def preprocess_bert_input(text):
    # Look up the IDs of all word pieces in the input.
    ids = tokenize_text(text, max_seq_len)
    # Build a mask, which in this case represents the length of our vector.
def __init__(self, bert_layer, max_len, min_len=1,
             CLS='[CLS]', SEP='[SEP]', PAD='[PAD]', UNK='[UNK]'):
    """
    Initializes the layer.
    :param CLS: Token that represents the start of a sentence.
    :param SEP: Token that represents the end of a segment.
    :param PAD: Token that represents padding.
    :param UNK: Token that represents unknown tokens.
    :param bert_layer: Keras layer loaded from a pretrained BERT model.
    """
    super().__init__()
    self._CLS = CLS
    self._SEP = SEP
    self._PAD = PAD
    self._min_len = min_len
    self._max_len = max_len

    resolved_object = bert_layer.resolved_object
    self.do_lower_case = resolved_object.do_lower_case.numpy()

    if hasattr(resolved_object, "tokenizer_type"):
        tokenizer_type_file = resolved_object.tokenizer_type.asset_path.numpy().decode("utf-8")
        with tf.io.gfile.GFile(tokenizer_type_file, 'r') as f_handler:
            self._tokenizer_type = f_handler.read().strip()
        tokenizer_file = resolved_object.tokenizer_file.asset_path.numpy().decode("utf-8")
        if self._tokenizer_type == SENTENCEPIECE:
            with tf.io.gfile.GFile(tokenizer_file, 'rb') as f_handler:
                sp_model = f_handler.read()
            self._tokenizer = tf_text.SentencepieceTokenizer(
                model=sp_model, out_type=tf.int32)
            self.vocab_table = create_tf_vocab_from_sp_tokenizer(
                self._tokenizer, num_oov_buckets=1)
        else:
            assert (self._tokenizer_type == SPACE)
            _, self.vocab_table = read_tf_vocab(tokenizer_file, UNK)
    else:
        vocab_file = resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
        _, self.vocab_table = create_tf_vocab_from_wp_tokenizer(
            vocab_file, num_oov_buckets=1)
        self._tokenizer = tf_text.BertTokenizer(
            self.vocab_table,
            token_out_type=tf.int64,
            lower_case=self.do_lower_case,
            unknown_token=UNK)
        self._tokenizer_type = WORDPIECE

    self._pad_id = self.vocab_table.lookup(tf.constant(PAD)) if PAD else -1
    self._cls_id = self.vocab_table.lookup(tf.constant(CLS)) if CLS else -1
    self._sep_id = self.vocab_table.lookup(tf.constant(SEP)) if SEP else -1
    if self._tokenizer_type == SENTENCEPIECE:
        self._pad_id = tf.cast(self._pad_id, tf.int32)
        self._cls_id = tf.cast(self._cls_id, tf.int32)
        self._sep_id = tf.cast(self._sep_id, tf.int32)
        batch_data)  # , padding=True, truncation=True)
end = time.time()
print(
    "The throughput of huggingface python tokenizer: {:,.2f} tokens/s".format(
        (total_tokens / (end - start))))

# BERT Tokenizer using TensorFlow Text
vocab_list = list(py_tokenizer.vocab.token_to_idx.keys())
lookup_table = tf.lookup.StaticVocabularyTable(
    tf.lookup.KeyValueTensorInitializer(keys=vocab_list,
                                        key_dtype=tf.string,
                                        values=tf.range(tf.size(vocab_list, out_type=tf.int64),
                                                        dtype=tf.int64),
                                        value_dtype=tf.int64),
    num_oov_buckets=1)
tf_tokenizer = tf_text.BertTokenizer(lookup_table)

# Warm-up pass before timing.
for batch_data in batches:
    input_ids = tf_tokenizer.tokenize(batch_data)

start = time.time()
for _ in range(epochs):
    for batch_data in batches:
        input_ids = tf_tokenizer.tokenize(batch_data)
end = time.time()

print(
    "The throughput of TensorFlow Text BertTokenizer: {:,.2f} tokens/s".format(
        (total_tokens / (end - start))))