Example #1
    def setUp(self):
        super(UtilsTest, self).setUp()

        vocab_tokens = [
            'NOTHING', '[CLS]', '[SEP]', '[MASK]', '[PAD]', 'a', 'b', 'c',
            '##d', 'd', '##e'
        ]
        vocab_file = self.create_tempfile()
        vocab_file.write_text(''.join([x + '\n' for x in vocab_tokens]))
        self._tokenizer = tokenization.FullTokenizer(vocab_file.full_path,
                                                     do_lower_case=True)
Example #2

    def __init__(self,
                 label_map,
                 max_seq_length,
                 do_lower_case,
                 converter,
                 use_open_vocab,
                 vocab_file=None,
                 converter_insertion=None,
                 special_glue_string_for_sources=None):
        """Initializes an instance of BertExampleBuilder.

        Args:
          label_map: Mapping from tags to tag IDs.
          max_seq_length: Maximum sequence length.
          do_lower_case: Whether to lower case the input text. Should be True
            for uncased models and False for cased models.
          converter: Converter from text targets to points.
          use_open_vocab: Whether to insert MASK tokens (True) or phrases
            (False). Currently only True is supported.
          vocab_file: Path to BERT vocabulary file.
          converter_insertion: Converter for building an insertion example
            based on the tagger output. Optional.
          special_glue_string_for_sources: If there are multiple sources, this
            string is used to combine them into one string. The empty string
            is a valid value. Optional.
        """
        self.label_map = label_map
        inverse_label_map = {}
        for label, label_id in label_map.items():
            if label_id in inverse_label_map:
                raise ValueError(
                    'Multiple labels with the same ID: {}'.format(label_id))
            inverse_label_map[label_id] = label
        self._inverse_label_map = frozendict.frozendict(inverse_label_map)
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file, do_lower_case=do_lower_case)
        self._max_seq_length = max_seq_length
        self._converter = converter
        self._pad_id = self._get_pad_id()
        self._do_lower_case = do_lower_case
        self._use_open_vocab = use_open_vocab
        self._converter_insertion = converter_insertion
        if special_glue_string_for_sources is not None:
            self._special_glue_string_for_sources = special_glue_string_for_sources
        else:
            self._special_glue_string_for_sources = ' '
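A hypothetical instantiation of this builder might look as follows; the label map values and the pointing converter are stand-ins, since their construction lives elsewhere in the codebase:

    # Usage sketch with assumed argument values, not from the source repo.
    builder = BertExampleBuilder(
        label_map={'KEEP': 0, 'DELETE': 1},  # assumed tag-to-ID mapping
        max_seq_length=128,
        do_lower_case=True,
        converter=pointing_converter,  # assumed: converts text targets to points
        use_open_vocab=True,           # only True is currently supported
        vocab_file='/path/to/vocab.txt')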
Example #3

  def __init__(self,
               label_map,
               vocab_file,
               do_lower_case,
               max_seq_length,
               max_predictions_per_seq,
               max_insertions_per_token,
               insert_after_token=True,
               special_glue_string_for_sources=None):
    """Initializes an instance of BertExampleBuilder.

    Args:
      label_map: Mapping from tags represented as (base_tag, num_insertions)
        tuples to tag IDs.
      vocab_file: Path to vocab file.
      do_lower_case: Whether to lowercase the input text.
      max_seq_length: Maximum sequence length.
      max_predictions_per_seq: Maximum number of tokens to insert per input.
      max_insertions_per_token: Maximum number of tokens/masks to insert per
        token.
      insert_after_token: Whether to insert tokens after the current token.
      special_glue_string_for_sources: If there are multiple sources, this
        string is used to combine them into one string. The empty string is a
        valid value. Optional.
    """
    self.label_map = label_map
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file, do_lower_case=do_lower_case)
    self._max_seq_length = max_seq_length
    self._max_predictions_per_seq = max_predictions_per_seq
    self._max_insertions_per_token = max_insertions_per_token
    self._insert_after_token = insert_after_token
    try:
      self._pad_id = self.tokenizer.convert_tokens_to_ids([constants.PAD])[0]
    except KeyError:
      self._pad_id = 0
    if special_glue_string_for_sources is not None:
      self._special_glue_string_for_sources = special_glue_string_for_sources
    else:
      self._special_glue_string_for_sources = ' '
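Both builders accept special_glue_string_for_sources and default it to a single space. The assumed effect (the call site is outside these snippets) is a plain string join over the sources:

    # Assumed behavior sketch; the actual call site is not shown above.
    sources = ['turn left.', 'then go straight.']
    merged = ' '.join(sources)        # default glue string
    merged_tight = ''.join(sources)   # the empty string is a valid glue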
Example #4
    def __init__(self,
                 max_seq_length,
                 max_predictions_per_seq,
                 label_map,
                 vocab_file=None,
                 do_lower_case=True,
                 fall_back_mode='random'):
        """Initializes an instance of InsertionConverter.

        Args:
          max_seq_length: Maximum length of source sequence.
          max_predictions_per_seq: Maximum number of MASK tokens.
          label_map: Dictionary mapping label IDs to labels.
          vocab_file: Path to BERT vocabulary file.
          do_lower_case: Whether to lowercase the input text.
          fall_back_mode: Behavior when no MASK tokens are generated:
            'random': Randomly add MASK tokens.
            'force': Leave the output unchanged (not recommended).
            Any other value: Return None and terminate early (saving
              computation time).
        """

        self._max_seq_length = max_seq_length
        self._max_predictions_per_seq = max_predictions_per_seq
        self._tokenizer = tokenization.FullTokenizer(
            vocab_file, do_lower_case=do_lower_case)
        self._label_map = label_map
        self._label_map_inverse = {v: k for k, v in self._label_map.items()}
        if fall_back_mode.lower() == 'random':
            self._do_random_mask = True
            self._do_lazy_generation = False
        elif fall_back_mode.lower() == 'force':
            self._do_random_mask = False
            self._do_lazy_generation = False
        else:
            self._do_random_mask = False
            self._do_lazy_generation = True
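The three modes collapse into two internal flags; a hypothetical construction (all argument values below are illustrative stand-ins) shows the "neither 'random' nor 'force'" branch, which enables lazy generation:

    # Hypothetical usage sketch; arguments are illustrative stand-ins.
    converter = InsertionConverter(
        max_seq_length=128,
        max_predictions_per_seq=20,
        label_map={0: 'KEEP', 1: 'DELETE'},  # assumed ID-to-label mapping
        vocab_file='/path/to/vocab.txt',
        fall_back_mode='skip')  # neither 'random' nor 'force'
    # Result: _do_random_mask is False and _do_lazy_generation is True,
    # so the converter returns None early when no MASK tokens are generated.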