Example #1
def build_transformer(transformer,
                      max_seq_length,
                      num_labels,
                      tagging=True,
                      tokenizer_only=False):
    tokenizer = AutoTokenizer_.from_pretrained(transformer)
    if tokenizer_only:
        return tokenizer
    l_bert = TFAutoModel.from_pretrained(transformer)
    l_input_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                        dtype='int32',
                                        name="input_ids")
    l_mask_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                       dtype='int32',
                                       name="mask_ids")
    l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                             dtype='int32',
                                             name="token_type_ids")
    output = l_bert(input_ids=l_input_ids,
                    token_type_ids=l_token_type_ids,
                    attention_mask=l_mask_ids).last_hidden_state
    if not tagging:
        output = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    logits = tf.keras.layers.Dense(num_labels)(output)
    model = tf.keras.Model(inputs=[l_input_ids, l_mask_ids, l_token_type_ids],
                           outputs=logits)
    model.build(input_shape=(None, max_seq_length))
    return model, tokenizer
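A minimal usage sketch for Example #1 (not from the source project; it assumes TensorFlow, TFAutoModel and AutoTokenizer_ are already imported, and 'bert-base-uncased' is an arbitrary checkpoint):

model, tokenizer = build_transformer('bert-base-uncased',
                                     max_seq_length=128,
                                     num_labels=2,
                                     tagging=False)
# Encode one sentence; the list order matches the Keras inputs above:
# [input_ids, mask_ids, token_type_ids].
encoding = tokenizer('HanLP is good.',
                     padding='max_length',
                     truncation=True,
                     max_length=128,
                     return_tensors='tf')
logits = model([encoding['input_ids'],
                encoding['attention_mask'],
                encoding['token_type_ids']])
print(logits.shape)  # (1, 2): one sentence-level score per label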
Example #2
def on_config_ready(self, transformer, max_seq_len, **kwargs):
    super().on_config_ready(**kwargs)
    self._tokenizer = TransformerTextTokenizer(
        AutoTokenizer_.from_pretrained(transformer),
        text_a_key='sent_a',
        text_b_key='sent_b',
        output_key='',
        max_seq_length=max_seq_len)
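A hedged sketch of how the transform configured in Example #2 might be applied to a sample dict (the sentence keys and checkpoint are illustrative; the exact output keys depend on TransformerTextTokenizer._KEY, which is not shown here):

tokenize = TransformerTextTokenizer(
    AutoTokenizer_.from_pretrained('bert-base-uncased'),
    text_a_key='sent_a',
    text_b_key='sent_b',
    output_key='',
    max_seq_length=128)
sample = {'sent_a': 'HanLP is a toolkit.', 'sent_b': 'It supports many languages.'}
sample = tokenize(sample)  # presumably adds fields such as input_ids to the sample
print(sorted(sample.keys()))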
Example #3
def on_config_ready(self, **kwargs):
    super().on_config_ready(**kwargs)
    if 'albert_chinese' in self.config.transformer:
        self.transformer_tokenizer = BertTokenizer.from_pretrained(
            self.config.transformer, use_fast=True)
    else:
        self.transformer_tokenizer = AutoTokenizer_.from_pretrained(
            self.config.transformer, use_fast=True)
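The branch above exists because Chinese ALBERT checkpoints ship with a BERT-style WordPiece vocabulary, so AutoTokenizer would otherwise load a SentencePiece tokenizer that does not match. A hedged illustration (the model id is an assumption, not taken from the source):

# Hypothetical: 'voidful/albert_chinese_base' stands in for any albert_chinese checkpoint.
tokenizer = BertTokenizer.from_pretrained('voidful/albert_chinese_base')
print(tokenizer.tokenize('商品和服务'))  # BERT-style, character-level pieces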
def __init__(self,
             tokenizer: Union[PreTrainedTokenizer, str],
             text_a_key: str,
             text_b_key: str = None,
             output_key=None,
             max_seq_length=512,
             truncate_long_sequences=True) -> None:
    super().__init__(max_seq_length, truncate_long_sequences)
    self.text_b = text_b_key
    self.text_a = text_a_key
    if output_key is None:
        output_key = self.text_a
        if text_b_key:
            output_key += '_' + text_b_key
    if output_key == '':
        output_key = self._KEY
    else:
        output_key = [f'{output_key}_{key}' for key in self._KEY]
    self.output_key = output_key
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer_.from_pretrained(tokenizer)
    self.tokenizer = tokenizer
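For reference, the key derivation above means results are stored under keys prefixed by the text keys; a hedged illustration (the exact suffixes come from self._KEY, which is defined elsewhere in the class):

t = TransformerTextTokenizer('bert-base-uncased',
                             text_a_key='sent_a',
                             text_b_key='sent_b')
# output_key defaults to 'sent_a_sent_b', so each field in _KEY is stored
# under 'sent_a_sent_b_<field>'; passing output_key='' would use _KEY verbatim.
print(t.output_key)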
    def __init__(self,
                 tokenizer: Union[PreTrainedTokenizer, str],
                 input_key,
                 output_key=None,
                 max_seq_length=512,
                 truncate_long_sequences=False,
                 config: PretrainedConfig = None,
                 cls_token_at_end=False,
                 cls_token_segment_id=0,
                 pad_token_segment_id=0,
                 pad_on_left=False,
                 do_padding=False,
                 sep_token_extra=False,
                 ret_mask_and_type=False,
                 ret_prefix_mask=False,
                 ret_token_span=True,
                 ret_subtokens=False,
                 ret_subtokens_group=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 do_basic_tokenize=True,
                 use_fast=True,
                 dict_force=None,
                 strip_cls_sep=True,
                 check_space_before=None,
                 ) -> None:
        """A transformer tokenizer for token-level tasks. It honors token boundaries: each token is tokenized into
        several subtokens which are then merged back, and the information about which token each subtoken belongs to
        is kept and returned as a new field in the sample. It also provides an out-of-the-box sliding window trick
        for long sequences.

        Args:
            tokenizer: The identifier of a pre-trained tokenizer or a ``PreTrainedTokenizer``.
            input_key: The token key in samples.
            output_key: The output keys to store results.
            max_seq_length: Sentences longer than ``max_seq_length`` will be split into shorter ones if possible.
            truncate_long_sequences: ``True`` to truncate the parts of long sequences that exceed ``max_seq_length``.
                ``False`` to enable the sliding window instead.
            config: The ``PretrainedConfig`` to determine the model structure of the transformer, so that special
                tokenization can be applied.
            cls_token_at_end: ``True`` to put ``[CLS]`` at the end of input tokens.
            cls_token_segment_id: The segment id assigned to ``[CLS]``.
            pad_token_segment_id: The segment id assigned to ``[PAD]``.
            pad_on_left: ``True`` to put ``[PAD]`` on the left side of the input tokens.
            do_padding: ``True`` to pad sequences to ``max_seq_length``.
            sep_token_extra: ``True`` to have two ``[SEP]``.
            ret_mask_and_type: ``True`` to return masks and type ids.
            ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
            ret_token_span: ``True`` to return span of each token measured by subtoken offsets.
            ret_subtokens: ``True`` to return the list of subtokens belonging to each token, for tokenization purposes.
                When enabled, the prefix mask for each subtoken is set to True, as each subtoken is a token unit in a
                tokenization task. Similarly, the token span for each token will be a contiguous integer sequence.
            ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token.
            cls_is_bos: ``True`` means the first token of the input is treated as ``[CLS]`` no matter what its surface
                        form is. ``False`` (default) means the first token is not ``[CLS]`` and will have its own
                        embedding rather than the embedding of ``[CLS]``.
            sep_is_eos: ``True`` means the last token of the input is ``[SEP]``.
                        ``False`` means it is not, but ``[SEP]`` will be appended.
                        ``None`` means it depends on ``input[-1] == [EOS]``.
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            use_fast: Whether or not to try to load the fast version of the tokenizer.
            dict_force: A dictionary doing longest-prefix-match on input text so that the head and tail of each keyword
                won't be concatenated to other tokens by transformer tokenizers.
            strip_cls_sep: ``True`` to strip [CLS] and [SEP] off the input tokens.
            check_space_before: ``True`` to detect the space before each token in order to handle the underline
                prefix (``▁``) in SentencePiece tokenization.

        Examples:

        .. highlight:: python
        .. code-block:: python

            transform = TransformerSequenceTokenizer('bert-base-uncased', 'token')
            sample = {'token': 'HanLP good'.split()}
            print(transform(sample))

        """
        super().__init__(max_seq_length, truncate_long_sequences)
        tokenizer_name = tokenizer if isinstance(tokenizer, str) else tokenizer.name_or_path
        if check_space_before is None:
            # These tokenizers are BPE-based; they prepend a space to each token, tokenizing "loving" into
            # ['▁lo', 'ving'] and 商品 into ['▁', '商品']. In the latter case, the prefix '▁' has to be removed,
            # as there is no space between tokens in languages like Chinese.
            check_space_before = tokenizer_name in ('xlm-roberta-base', 'xlm-roberta-large', 'google/mt5-small',
                                                    'google/mt5-base')
        self.check_space_before = check_space_before
        self.ret_subtokens_group = ret_subtokens_group
        self.ret_subtokens = ret_subtokens
        self.sep_is_eos = sep_is_eos
        self.ret_prefix_mask = ret_prefix_mask
        self.ret_mask_and_type = ret_mask_and_type
        self.cls_is_bos = cls_is_bos
        self.ret_token_span = ret_token_span
        if not output_key or isinstance(output_key, str):
            suffixes = ['input_ids']
            if ret_mask_and_type:
                suffixes += 'attention_mask', 'token_type_ids'
            if ret_prefix_mask:
                suffixes += ['prefix_mask']
            if ret_token_span:
                suffixes.append('token_span')
            if output_key is None:
                output_key = [f'{input_key}_{key}' for key in suffixes]
            elif output_key == '':
                output_key = suffixes
            else:
                output_key = [f'{output_key}_{key}' for key in suffixes]

        self.input_key = input_key
        self.output_key = output_key
        if config:
            xlnet = config_is(config, 'xlnet')
            pad_token_segment_id = 4 if xlnet else 0
            cls_token_segment_id = 2 if xlnet else 0
            cls_token_at_end = xlnet
            pad_on_left = xlnet
        if isinstance(tokenizer, str):
            tokenizer = AutoTokenizer_.from_pretrained(tokenizer, use_fast=use_fast,
                                                       do_basic_tokenize=do_basic_tokenize)
        if use_fast:
            # Dirty fix upstream bug: https://github.com/hankcs/HanLP/issues/1602
            if hasattr(tokenizer, '_tokenizer') and hasattr(tokenizer._tokenizer, 'no_truncation'):
                _t = tokenizer._tokenizer
                _t.no_truncation()
                _t.no_padding()
                _t.no_truncation = _t.no_padding = lambda: None
        pad_token = tokenizer.pad_token
        self.pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
        self.pad_token_segment_id = pad_token_segment_id
        if tokenizer_name in ('google/mt5-small', 'google/mt5-base'):
            # mt5 doesn't have cls or sep, but we can use something similar
            self.has_cls = False
            self.cls_token = '▁'
            self.cls_token_id = tokenizer.convert_tokens_to_ids(self.cls_token)
            self.sep_token = tokenizer.eos_token
            self.sep_token_id = tokenizer.eos_token_id
        else:
            self.has_cls = True
            self.cls_token = tokenizer.cls_token
            self.sep_token = tokenizer.sep_token
            self.cls_token_segment_id = cls_token_segment_id
            self.cls_token_id = tokenizer.cls_token_id
            self.sep_token_id = tokenizer.sep_token_id

        self.sep_token_extra = sep_token_extra
        self.cls_token_at_end = cls_token_at_end
        self.tokenizer = tokenizer
        self.pad_on_left = pad_on_left
        self.do_padding = do_padding
        if self.ret_token_span or not self.truncate_long_sequences:
            assert not self.cls_token_at_end
            assert not self.pad_on_left
        # if self.ret_subtokens:
        #     if not use_fast:
        #         raise NotImplementedError(
        #             'ret_subtokens is not available when using Python tokenizers. '
        #             'To use this feature, set use_fast = True.')
        self.dict: Optional[DictInterface] = dict_force  # For tokenization of raw text
        self.strip_cls_sep = strip_cls_sep
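With the defaults above (only ret_token_span enabled and output_key left as None), the docstring example stores its results under keys derived from input_key; a hedged sketch of what that looks like:

transform = TransformerSequenceTokenizer('bert-base-uncased', 'token')
sample = transform({'token': 'HanLP good'.split()})
# With input_key='token', the derived output keys are
# 'token_input_ids' and 'token_token_span'.
print(sorted(sample.keys()))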
def main():
    transformer = 'bert-base-uncased'
    tokenizer: PreTrainedTokenizer = AutoTokenizer_.from_pretrained(transformer)
    # _test_text_transform(tokenizer)
    _test_sequence_transform(tokenizer)
Example #7
def build_transformer_tokenizer(config_or_str, use_fast=True, do_basic_tokenize=True) -> PreTrainedTokenizer:
    return AutoTokenizer_.from_pretrained(config_or_str, use_fast, do_basic_tokenize)
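A hedged usage sketch of the helper in Example #7 (the checkpoint name is arbitrary):

tokenizer = build_transformer_tokenizer('bert-base-uncased')
print(tokenizer.tokenize('HanLP is good.'))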
    def __init__(self,
                 field: str,
                 transformer: str,
                 average_subwords=False,
                 scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
                 word_dropout: Optional[Union[float, Tuple[float,
                                                           str]]] = None,
                 max_sequence_length=None,
                 truncate_long_sequences=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 ret_token_span=True,
                 ret_subtokens=False,
                 ret_subtokens_group=False,
                 ret_prefix_mask=False,
                 ret_raw_hidden_states=False,
                 transformer_args: Dict[str, Any] = None,
                 use_fast=True,
                 do_basic_tokenize=True,
                 trainable=True) -> None:
        """A contextual word embedding builder which builds a
        :class:`~hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule` and a
        :class:`~hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer`.

        Args:
            field: The field to work on. Usually some token fields.
            transformer: An identifier of a ``PreTrainedModel``.
            average_subwords: ``True`` to average subword representations.
            scalar_mix: Layer attention.
            word_dropout: The probability of randomly replacing a subword with ``[MASK]``.
            max_sequence_length: The maximum sequence length. Sequences longer than this will be handled by a sliding
                window.
            truncate_long_sequences: ``True`` to truncate the parts of long sequences that exceed
                ``max_sequence_length``. ``False`` to enable the sliding window instead.
            cls_is_bos: ``True`` means the first token of the input is treated as ``[CLS]`` no matter what its surface
                        form is. ``False`` (default) means the first token is not ``[CLS]`` and will have its own
                        embedding rather than the embedding of ``[CLS]``.
            sep_is_eos: ``True`` means the last token of the input is ``[SEP]``.
                        ``False`` means it is not, but ``[SEP]`` will be appended.
                        ``None`` means it depends on ``input[-1] == [EOS]``.
            ret_token_span: ``True`` to return span of each token measured by subtoken offsets.
            ret_subtokens: ``True`` to return list of subtokens belonging to each token.
            ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token.
            ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
            ret_raw_hidden_states: ``True`` to return hidden states of each layer.
            transformer_args: Extra arguments passed to the transformer.
            use_fast: Whether or not to try to load the fast version of the tokenizer.
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            trainable: ``False`` to use static embeddings.
        """
        super().__init__()
        self.truncate_long_sequences = truncate_long_sequences
        self.transformer_args = transformer_args
        self.trainable = trainable
        self.ret_subtokens_group = ret_subtokens_group
        self.ret_subtokens = ret_subtokens
        self.ret_raw_hidden_states = ret_raw_hidden_states
        self.sep_is_eos = sep_is_eos
        self.cls_is_bos = cls_is_bos
        self.max_sequence_length = max_sequence_length
        self.word_dropout = word_dropout
        self.scalar_mix = scalar_mix
        self.average_subwords = average_subwords
        self.transformer = transformer
        self.field = field
        self._transformer_tokenizer = AutoTokenizer_.from_pretrained(
            self.transformer,
            use_fast=use_fast,
            do_basic_tokenize=do_basic_tokenize)
        self._tokenizer_transform = TransformerSequenceTokenizer(
            self._transformer_tokenizer,
            field,
            truncate_long_sequences=truncate_long_sequences,
            ret_prefix_mask=ret_prefix_mask,
            ret_token_span=ret_token_span,
            cls_is_bos=cls_is_bos,
            sep_is_eos=sep_is_eos,
            ret_subtokens=ret_subtokens,
            ret_subtokens_group=ret_subtokens_group,
            max_seq_length=self.max_sequence_length)
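A hedged construction sketch for the embedding builder above, assuming this __init__ belongs to HanLP's ContextualWordEmbedding class referenced in the docstring (argument values are illustrative):

embed = ContextualWordEmbedding(field='token',
                                transformer='bert-base-uncased',
                                average_subwords=True,
                                word_dropout=0.1,
                                max_sequence_length=512,
                                truncate_long_sequences=False)
# The builder keeps the TransformerSequenceTokenizer created in __init__,
# which turns token lists into subtoken ids plus token spans.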
Example #9
File: mlm.py Project: lei1993/HanLP
def load_vocabs(self, save_dir, filename='vocabs.json'):
    self.tokenizer = AutoTokenizer_.from_pretrained(
        self.config.transformer)