Example #1
0
 def load_from_file(cls, filename_prefix):
     """Restore a BertTextEncoder from parameters saved under `filename_prefix`.

     Reads the JSON file '{filename_prefix}.encoder', rebuilds the
     BERT tokenizer from the saved vocab settings, and returns an
     encoder configured with the remaining saved parameters.
     """
     # Load the serialized encoder configuration (JSON).
     with tf.io.gfile.GFile(f'{filename_prefix}.encoder') as f:
         saved = json.load(f)
     # Rebuild the tokenizer first, then wrap it in the encoder.
     return BertTextEncoder(
         tokenizer=tokenizers.BertTokenizer(
             vocab_file=saved['vocab_file'],
             do_lower_case=saved['do_lower_case']),
         max_seq_length=saved['max_seq_length'],
         include_title=saved['include_title'],
         include_sentence_id=saved['include_sentence_id'])
Example #2
0
    def _build_tokenizer(self):
        """Build the correct tokenizer depending on model encoder.

        Returns:
          Tokenizer for the model, selected by
          `self._model_config.tokenizer` ('basic' or 'bert').

        Raises:
          ValueError: if `self._model_config.tokenizer` is not a
            recognized tokenizer name.
        """
        tokenizer_name = self._model_config.tokenizer
        if tokenizer_name == 'basic':
            # Plain TFDS tokenizer, wrapped so that reserved separator
            # tokens are kept intact rather than split.
            base_tokenizer = tfds.deprecated.text.Tokenizer()
            return tokenizers.ReservedTokenizer(
                tokenizer=base_tokenizer,
                reserved_re=preprocessing.SEPARATOR_RE)
        elif tokenizer_name == 'bert':
            return tokenizers.BertTokenizer(
                vocab_file=self._model_config.bert_vocab_path,
                do_lower_case=True)
        else:
            # Include the offending value so misconfigurations are easy
            # to diagnose from the traceback alone.
            raise ValueError(f'Invalid tokenizer: {tokenizer_name!r}')