def load_from_file(cls, filename_prefix):
  """Restores an encoder from parameters saved by a prior export.

  Reads the JSON parameter file at '<filename_prefix>.encoder', rebuilds the
  BERT tokenizer from the stored vocab settings, and constructs the encoder
  with the stored sequence/feature options.

  Args:
    filename_prefix: Path prefix; parameters are read from
      '<filename_prefix>.encoder'.

  Returns:
    A newly constructed encoder instance of type `cls`.
  """
  with tf.io.gfile.GFile(f'{filename_prefix}.encoder') as f:
    params = json.load(f)
  bert_tokenizer = tokenizers.BertTokenizer(
      vocab_file=params['vocab_file'],
      do_lower_case=params['do_lower_case'])
  # Construct via `cls` (not a hard-coded class name) so that subclasses
  # invoking this classmethod get instances of the subclass, not the base.
  return cls(
      tokenizer=bert_tokenizer,
      max_seq_length=params['max_seq_length'],
      include_title=params['include_title'],
      include_sentence_id=params['include_sentence_id'])
def _build_tokenizer(self): """Build the correct tokenizer depending on model encoder. Returns: Tokenizer for model """ if self._model_config.tokenizer == 'basic': base_tokenizer = tfds.deprecated.text.Tokenizer() return tokenizers.ReservedTokenizer( tokenizer=base_tokenizer, reserved_re=preprocessing.SEPARATOR_RE) elif self._model_config.tokenizer == 'bert': return tokenizers.BertTokenizer( vocab_file=self._model_config.bert_vocab_path, do_lower_case=True) else: raise ValueError('Invalid tokenizer')