def on_config_ready(self, **kwargs):
    super().on_config_ready(**kwargs)
    if 'albert_chinese' in self.config.transformer:
        self.transformer_tokenizer = BertTokenizer.from_pretrained(
            self.config.transformer, use_fast=True)
    else:
        self.transformer_tokenizer = AutoTokenizer.from_pretrained(
            self.config.transformer, use_fast=True)
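The explicit ``BertTokenizer`` branch is there because albert_chinese checkpoints typically ship a BERT-style vocabulary rather than a SentencePiece model, which ``AutoTokenizer`` would otherwise try to load. A minimal standalone sketch of the same pattern; the helper name and checkpoint identifier are only illustrative:

from transformers import AutoTokenizer, BertTokenizer

def load_transformer_tokenizer(transformer: str):
    # Hypothetical helper mirroring the hook above: albert_chinese checkpoints
    # need BertTokenizer explicitly; everything else goes through AutoTokenizer.
    if 'albert_chinese' in transformer:
        return BertTokenizer.from_pretrained(transformer, use_fast=True)
    return AutoTokenizer.from_pretrained(transformer, use_fast=True)

tokenizer = load_transformer_tokenizer('bert-base-uncased')  # falls into the AutoTokenizer branch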
Example #2
def build_transformer_tokenizer(self):
    transformer = self.config.transformer
    if transformer:
        transformer_tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
            transformer, use_fast=True)
    else:
        transformer_tokenizer = None
    self.transformer_tokenizer = transformer_tokenizer
    return transformer_tokenizer
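A short sketch of how both branches behave, treating the method above as a module-level function and passing a stand-in object for ``self``; it assumes ``AutoTokenizer`` has been imported from ``transformers`` and the stand-in is only for illustration:

from types import SimpleNamespace

# Hypothetical stand-in exposing just the attributes the method reads and writes.
component = SimpleNamespace(config=SimpleNamespace(transformer='bert-base-uncased'),
                            transformer_tokenizer=None)

tok = build_transformer_tokenizer(component)            # loads a fast BERT tokenizer via AutoTokenizer
component.config.transformer = None
assert build_transformer_tokenizer(component) is None   # nothing configured, so nothing is built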
Example #3
def build_transformer_tokenizer(
        config_or_str,
        use_fast=True,
        do_basic_tokenize=True) -> PreTrainedTokenizer:
    if isinstance(config_or_str, str):
        transformer = config_or_str
    else:
        transformer = config_or_str.transformer
    if use_fast and not do_basic_tokenize:
        warnings.warn(
            '`do_basic_tokenize=False` might not work when `use_fast=True`'
        )
    return AutoTokenizer.from_pretrained(
        transformer,
        use_fast=use_fast,
        do_basic_tokenize=do_basic_tokenize)
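A usage sketch exercising both accepted argument types and the warning path of this example; the checkpoint name is only an example and the ``SimpleNamespace`` config is a stand-in for the real config object:

import warnings
from types import SimpleNamespace

# String form: the identifier is passed straight to AutoTokenizer.
tok = build_transformer_tokenizer('bert-base-uncased')

# Config form: anything with a `.transformer` attribute works.
tok = build_transformer_tokenizer(SimpleNamespace(transformer='bert-base-uncased'))

# Asking for do_basic_tokenize=False while keeping use_fast=True triggers the warning above,
# because the option may be ignored by the fast (Rust) tokenizers.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    build_transformer_tokenizer('bert-base-uncased', do_basic_tokenize=False)
assert any('do_basic_tokenize' in str(w.message) for w in caught)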
Example #4
def __init__(self,
             tokenizer: Union[PreTrainedTokenizer, str],
             text_a_key: str,
             text_b_key: str = None,
             output_key=None,
             max_seq_length=512,
             truncate_long_sequences=True) -> None:
    super().__init__(max_seq_length, truncate_long_sequences)
    self.text_b = text_b_key
    self.text_a = text_a_key
    if output_key is None:
        output_key = self.text_a
        if text_b_key:
            output_key += '_' + text_b_key
    if output_key == '':
        output_key = self._KEY
    else:
        output_key = [f'{output_key}_{key}' for key in self._KEY]
    self.output_key = output_key
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    self.tokenizer = tokenizer
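To make the key derivation above concrete, a traced sketch of the same logic; ``_KEY`` is assumed here to be the usual triple of encoder inputs, which may differ from the real class attribute:

_KEY = ['input_ids', 'attention_mask', 'token_type_ids']  # assumed value, for illustration only

def derive_output_key(text_a_key, text_b_key=None, output_key=None):
    # Mirrors the constructor: default to '<text_a>[_<text_b>]', then suffix each entry of _KEY.
    if output_key is None:
        output_key = text_a_key
        if text_b_key:
            output_key += '_' + text_b_key
    if output_key == '':
        return _KEY
    return [f'{output_key}_{key}' for key in _KEY]

print(derive_output_key('text'))                 # ['text_input_ids', 'text_attention_mask', 'text_token_type_ids']
print(derive_output_key('text_a', 'text_b'))     # ['text_a_text_b_input_ids', ...]
print(derive_output_key('text', output_key=''))  # falls back to the bare _KEY names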
    def __init__(
        self,
        tokenizer: Union[PreTrainedTokenizer, str],
        input_key,
        output_key=None,
        max_seq_length=512,
        truncate_long_sequences=False,
        config: PretrainedConfig = None,
        cls_token_at_end=False,
        cls_token_segment_id=0,
        pad_token_segment_id=0,
        pad_on_left=False,
        do_padding=False,
        sep_token_extra=False,
        ret_mask_and_type=False,
        ret_prefix_mask=False,
        ret_token_span=True,
        ret_subtokens=False,
        ret_subtokens_group=False,
        cls_is_bos=False,
        sep_is_eos=False,
        do_basic_tokenize=True,
        use_fast=True,
        dict_force=None,
        strip_cls_sep=True,
        check_space_before=None,
    ) -> None:
        """A transformer tokenizer for token-level tasks. It honors the boundary of tokens and tokenize each token into
        several subtokens then merge them. The information about each subtoken belongs to which token are kept and
        returned as a new field in the sample. It also provides out-of-box sliding window trick on long sequences.

        Args:
            tokenizer: The identifier of a pre-trained tokenizer or a ``PreTrainedTokenizer``.
            input_key: The token key in samples.
            output_key: The output keys to store results.
            max_seq_length: Sentences longer than ``max_seq_length`` will be split into shorter ones if possible.
            truncate_long_sequences: ``True`` to truncate exceeded parts of long sequences. ``False`` to  enable
                sliding window.
            config: The ``PretrainedConfig`` to determine the model structure of the transformer, so that special
                tokenization can be applied.
            cls_token_at_end: ``True`` to put ``[CLS]`` at the end of input tokens.
            cls_token_segment_id: The segment id of ``[CLS]``.
            pad_token_segment_id: The segment id of ``[PAD]``.
            pad_on_left: ``True`` to put ``[PAD]`` on the left side of the input tokens.
            do_padding: ``True`` to pad sequences.
            sep_token_extra: ``True`` to use an extra ``[SEP]`` (two in total).
            ret_mask_and_type: ``True`` to return masks and type ids.
            ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
            ret_token_span: ``True`` to return span of each token measured by subtoken offsets.
            ret_subtokens: ``True`` to return the list of subtokens belonging to each token, for tokenization purposes.
                When enabled, the prefix mask for each subtoken is set to ``True``, as each subtoken is a token unit in
                a tokenization task. Similarly, the token span for each token will be a contiguous integer sequence.
            ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token.
            cls_is_bos: ``True`` means the first token of the input is treated as ``[CLS]`` no matter what its surface
                form is. ``False`` (default) means the first token is not ``[CLS]``; it will have its own embedding
                rather than the embedding of ``[CLS]``.
            sep_is_eos: ``True`` means the last token of the input is ``[SEP]``.
                ``False`` means it is not, but ``[SEP]`` will be appended.
                ``None`` means it depends on ``input[-1] == [EOS]``.
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            use_fast: Whether or not to try to load the fast version of the tokenizer.
            dict_force: A dictionary doing longest-prefix-match on input text so that the head and tail of each keyword
                won't be concatenated to other tokens by transformer tokenizers.
            strip_cls_sep: ``True`` to strip [CLS] and [SEP] off the input tokens.
            check_space_before: ``True`` to detect the space before each token, in order to handle the underscore
                (``▁``) prefix produced by SentencePiece tokenization.

        Examples:

        .. highlight:: python
        .. code-block:: python

            transform = TransformerSequenceTokenizer('bert-base-uncased', 'token')
            sample = {'token': 'HanLP good'.split()}
            print(transform(sample))

        """
        super().__init__(max_seq_length, truncate_long_sequences)
        tokenizer_name = tokenizer if isinstance(
            tokenizer, str) else tokenizer.name_or_path
        if check_space_before is None:
            # These tokenizers are SentencePiece/BPE-based: they prepend a space to each token and tokenize 'loving'
            # into ['▁lo', 'ving'] but '商品' into ['▁', '商品']. In the latter case, the prefix '▁' has to be removed,
            # because there is no space between words in languages such as Chinese.
            check_space_before = tokenizer_name in ('xlm-roberta-base',
                                                    'xlm-roberta-large',
                                                    'google/mt5-small',
                                                    'google/mt5-base')
        self.check_space_before = check_space_before
        self.ret_subtokens_group = ret_subtokens_group
        self.ret_subtokens = ret_subtokens
        self.sep_is_eos = sep_is_eos
        self.ret_prefix_mask = ret_prefix_mask
        self.ret_mask_and_type = ret_mask_and_type
        self.cls_is_bos = cls_is_bos
        self.ret_token_span = ret_token_span
        if not output_key or isinstance(output_key, str):
            suffixes = ['input_ids']
            if ret_mask_and_type:
                suffixes += 'attention_mask', 'token_type_ids'
            if ret_prefix_mask:
                suffixes += ['prefix_mask']
            if ret_token_span:
                suffixes.append('token_span')
            if output_key is None:
                output_key = [f'{input_key}_{key}' for key in suffixes]
            elif output_key == '':
                output_key = suffixes
            else:
                output_key = [f'{output_key}_{key}' for key in suffixes]

        self.input_key = input_key
        self.output_key = output_key
        if config:
            xlnet = config_is(config, 'xlnet')
            pad_token_segment_id = 4 if xlnet else 0
            cls_token_segment_id = 2 if xlnet else 0
            cls_token_at_end = xlnet
            pad_on_left = xlnet
        if isinstance(tokenizer, str):
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer,
                use_fast=use_fast,
                do_basic_tokenize=do_basic_tokenize)
        if use_fast:
            # Dirty fix upstream bug: https://github.com/hankcs/HanLP/issues/1602
            if hasattr(tokenizer, '_tokenizer') and hasattr(
                    tokenizer._tokenizer, 'no_truncation'):
                _t = tokenizer._tokenizer
                _t.no_truncation()
                _t.no_padding()
                _t.no_truncation = _t.no_padding = lambda: None
        pad_token = tokenizer.pad_token
        self.pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
        self.pad_token_segment_id = pad_token_segment_id
        if tokenizer_name in ('google/mt5-small', 'google/mt5-base'):
            # mt5 doesn't have cls or sep, but we can use something similar
            self.has_cls = False
            self.cls_token = '▁'
            self.cls_token_id = tokenizer.convert_tokens_to_ids(self.cls_token)
            self.sep_token = tokenizer.eos_token
            self.sep_token_id = tokenizer.eos_token_id
        else:
            self.has_cls = True
            self.cls_token = tokenizer.cls_token
            self.sep_token = tokenizer.sep_token
            self.cls_token_segment_id = cls_token_segment_id
            self.cls_token_id = tokenizer.cls_token_id
            self.sep_token_id = tokenizer.sep_token_id

        self.sep_token_extra = sep_token_extra
        self.cls_token_at_end = cls_token_at_end
        self.tokenizer = tokenizer
        self.pad_on_left = pad_on_left
        self.do_padding = do_padding
        if self.ret_token_span or not self.truncate_long_sequences:
            assert not self.cls_token_at_end
            assert not self.pad_on_left
        if self.ret_subtokens:
            if not use_fast:
                raise NotImplementedError(
                    'ret_subtokens is not available when using Python tokenizers. '
                    'To use this feature, set use_fast = True.')
        self.dict: Optional[DictInterface] = dict_force  # For tokenization of raw text
        self.strip_cls_sep = strip_cls_sep
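Following the docstring example, a usage sketch showing which output keys the default flags produce; the key names follow directly from the suffix logic above, while the printed values depend on the checkpoint and on the ``__call__`` implementation, which is not shown here:

transform = TransformerSequenceTokenizer('bert-base-uncased', 'token')

# Defaults: ret_token_span=True, ret_mask_and_type=False, ret_prefix_mask=False,
# so suffixes == ['input_ids', 'token_span'] and each key is prefixed with input_key.
assert transform.output_key == ['token_input_ids', 'token_token_span']

sample = {'token': 'HanLP good'.split()}
print(transform(sample))  # adds 'token_input_ids' and 'token_token_span' to the sample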
def main():
    transformer = 'bert-base-uncased'
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(transformer)
    # _test_text_transform(tokenizer)
    _test_sequence_transform(tokenizer)