Example #1
    def _tokenize(self, text, never_split=None, **kwargs):
        if self.do_preprocessing:
            if self.do_lower_case:
                text = text.lower()
            text = str(" ".join(text_processor.pre_process_doc(text)))
            text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ',
                          text)
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
            text = re.sub(r'^\s', '', text)
            text = re.sub(r'\s$', '', text)

        split_tokens = [text]
        if self.do_wordpiece_tokenize:
            wordpiece_tokenizer = WordpieceTokenizer(self.vocab,
                                                     self.unk_token)
            split_tokens = wordpiece_tokenizer.tokenize(text)

        elif self.do_char_tokenize:
            tokenizer = CharacterTokenizer(self.vocab, self.unk_token)
            split_tokens = tokenizer.tokenize(text)

        elif self.do_basic_tokenize:
            """Tokenizes a piece of text."""
            split_tokens = self.base_bert_tok.tokenize(text)

        return split_tokens
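Below is a minimal sketch of just the preprocessing chain used in this _tokenize, for readers who want to see what the regular expressions do in isolation. The ekphrasis text_processor step is omitted because it depends on an externally configured pipeline, and the preprocess helper name is hypothetical.

import re

def preprocess(text, do_lower_case=True):
    # Hypothetical standalone version of the cleanup above (ekphrasis step omitted).
    if do_lower_case:
        text = text.lower()
    # Keep Latin letters (including accented ones), a few markup/emoticon
    # characters, whitespace and supplementary-plane emoji; everything else
    # becomes a space.
    text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', text)
    # Collapse whitespace runs and squeeze 3+ repeated characters down to two.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
    return text.strip()

print(preprocess("Sooooo goooood!!! 123 :)"))  # -> 'soo good!!!'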
Example #2
    def __init__(self,
                 vocab_path,
                 do_lower_case=True,
                 max_len=None,
                 freq_path=None):
        """Constructs a BertTokenizer.

        Args:
          vocab_file: Path to a one-wordpiece-per-line vocabulary file
          max_len: An artificial maximum length to truncate tokenized sequences to;
                         Effective maximum length is always the minimum of this
                         value (if specified) and the underlying BERT model's
                         sequence length.
        """
        self.token_to_idx = json.load(open(vocab_path, 'r'),
                                      object_pairs_hook=OrderedDict)
        self.idx_to_token = OrderedDict([
            (idx, tok) for tok, idx in self.token_to_idx.items()
        ])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.token_to_idx)
        self.max_len = max_len if max_len is not None else int(1e12)

        if freq_path is not None:
            self.token_to_freq = json.load(open(freq_path, 'r'),
                                           object_pairs_hook=OrderedDict)
Example #3
    def __init__(self,
                 vocab_file,
                 do_lower_case=False,
                 do_basic_tokenize=True,
                 do_wordpiece_tokenize=True,
                 mecab_dict_path=None,
                 unk_token='[UNK]',
                 sep_token='[SEP]',
                 pad_token='[PAD]',
                 cls_token='[CLS]',
                 mask_token='[MASK]',
                 **kwargs):
        """Constructs a MecabBertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_basic_tokenize**: (`optional`) boolean (default True)
                Whether to do basic tokenization with MeCab before wordpiece.
            **mecab_dict_path**: (`optional`) string
                Path to a directory of a MeCab dictionary.
        """
        super(BertTokenizer, self).__init__(unk_token=unk_token,
                                            sep_token=sep_token,
                                            pad_token=pad_token,
                                            cls_token=cls_token,
                                            mask_token=mask_token,
                                            **kwargs)

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(
                    vocab_file))

        self.vocab = load_vocab(vocab_file)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_basic_tokenize = do_basic_tokenize
        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = MecabBasicTokenizer(
                do_lower_case=do_lower_case, mecab_dict_path=mecab_dict_path)

        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(
                vocab=self.vocab, unk_token=self.unk_token)
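A hedged usage sketch for the constructor above, assuming the surrounding class is the MecabBertTokenizer named in the docstring and that it inherits the usual tokenize/convert_tokens_to_ids methods from BertTokenizer; both paths are placeholders.

# Hypothetical usage; paths are placeholders.
tokenizer = MecabBertTokenizer(
    vocab_file='vocab.txt',
    do_lower_case=False,
    mecab_dict_path='/path/to/mecab/dic',
)
tokens = tokenizer.tokenize('日本語のテキストを分かち書きする。')  # MeCab words -> wordpieces
ids = tokenizer.convert_tokens_to_ids(tokens)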
Example #4
    def __init__(self, vocab_file, do_lower_case=True,
                 do_basic_tokenize=True, do_char_tokenize=False,
                 do_wordpiece_tokenize=False, do_preprocessing=True,
                 unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]',
                 cls_token='[CLS]', mask_token='[MASK]', **kwargs):
        super(BertTokenizer, self).__init__(
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)

        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        self.do_lower_case = do_lower_case
        self.vocab_file = vocab_file
        self.do_basic_tokenize = do_basic_tokenize
        self.do_char_tokenize = do_char_tokenize
        self.unk_token = unk_token
        self.do_preprocessing = do_preprocessing

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])

        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                          unk_token=self.unk_token)
            
        self.base_bert_tok = BertTokenizer(
            vocab_file=self.vocab_file, do_lower_case=do_lower_case,
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)
Example #5
    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(
                vocab, unk_token=self.config.get("unk_token", "[UNK]"))

        tokens = []

        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
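For reference, WordpieceTokenizer on its own performs greedy longest-match-first splitting against the vocabulary, prefixing continuation pieces with '##'. A toy sketch follows; the import path assumes a recent transformers release (older versions expose the class as transformers.tokenization_bert.WordpieceTokenizer).

from transformers.models.bert.tokenization_bert import WordpieceTokenizer

vocab = {'[UNK]': 0, 'he': 1, '##llo': 2, 'wo': 3, '##rld': 4}
wp = WordpieceTokenizer(vocab=vocab, unk_token='[UNK]')

print(wp.tokenize('hello world'))  # ['he', '##llo', 'wo', '##rld']
print(wp.tokenize('xyz'))          # ['[UNK]'] -- no piece matches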
Example #6
class SubwordTokenizer(Tokenizer):
    """
    Subword Tokenizer

    text -> [word tokens] -> [[sub word tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """

    def __init__(self, name, word_tokenizer, config={}):
        super(SubwordTokenizer, self).__init__(name, f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(
                vocab, unk_token=self.config.get("unk_token", "[UNK]"))

        tokens = []

        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
Example #7
def make_alignment(tokenizer: transformers.WordpieceTokenizer,
                   tokens: List[str]) -> Tuple[List[str], List[List[int]]]:
    """ Make the alignment between tokens and the subtokens. It is
    useful to interpret results or to understand the model reasoning. """
    i = 0
    sub_tokens = []
    alignment = []
    for token in tokens:

        indices = []
        word_pieces = tokenizer.tokenize(token)
        for sub_token in word_pieces:
            indices.append(i)
            sub_tokens.append(sub_token)
            i += 1

        alignment.append(indices)
    return sub_tokens, alignment
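A hedged usage sketch for make_alignment, assuming bert_tokenizer is an already-loaded slow BertTokenizer whose wordpiece_tokenizer attribute is the expected WordpieceTokenizer; the exact pieces depend on its vocabulary.

# Hypothetical usage; the split depends on the tokenizer's vocabulary.
tokens = ['unaffable', 'weather']
sub_tokens, alignment = make_alignment(bert_tokenizer.wordpiece_tokenizer, tokens)
# e.g. sub_tokens -> ['un', '##aff', '##able', 'weather']
#      alignment  -> [[0, 1, 2], [3]]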
Example #8
    def extend_bert_vocab(self, words_to_extend):
        init_len = len(self.tokenizer.vocab)
        cur_ind = init_len
        for i in words_to_extend:
            if i in self.tokenizer.vocab:
                continue
            self.tokenizer.vocab[i] = cur_ind
            cur_ind += 1

        print(f"extend bert tokenizer with extra {cur_ind - init_len} words!")
        self.tokenizer.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.tokenizer.vocab.items()
        ])
        self.tokenizer.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.tokenizer.vocab, unk_token=self.tokenizer.unk_token)
        self.encoder._resize_token_embeddings(cur_ind)
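The same splice can be written as a standalone helper on a plain slow (Python) BertTokenizer; the sketch below makes that assumption, and the extend_vocab name is hypothetical. Resizing the model's embedding matrix is left to the caller.

from collections import OrderedDict

from transformers import BertTokenizer
from transformers.models.bert.tokenization_bert import WordpieceTokenizer

def extend_vocab(tokenizer: BertTokenizer, new_words):
    # Append unseen words to the vocab and rebuild the lookup tables,
    # mirroring extend_bert_vocab above; the caller is expected to run
    # model.resize_token_embeddings(len(tokenizer)) afterwards.
    for word in new_words:
        if word not in tokenizer.vocab:
            tokenizer.vocab[word] = len(tokenizer.vocab)
    tokenizer.ids_to_tokens = OrderedDict(
        (idx, tok) for tok, idx in tokenizer.vocab.items())
    tokenizer.wordpiece_tokenizer = WordpieceTokenizer(
        vocab=tokenizer.vocab, unk_token=tokenizer.unk_token)
    return tokenizer

The supported tokenizer.add_tokens API registers whole added tokens that are never split, whereas splicing the vocabulary like this also makes the new entries available to WordPiece matching, which is presumably why these examples rebuild the tables directly.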
Example #9
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )
    # Dirty hack to add NK vocab to our tokenizer
    # From: https://github.com/deepset-ai/FARM/issues/157
    from collections import OrderedDict
    from transformers import BertTokenizer, WordpieceTokenizer
    
    with open('jobert-vocab.txt', 'r', encoding='utf8') as fp:
        vocab = fp.read().splitlines()
    tokens_to_add = [token for token in vocab if not (token in tokenizer.vocab or token in tokenizer.all_special_tokens)]
    tokenizer.vocab = OrderedDict([
        *tokenizer.vocab.items(),
        *[
            (token, i + len(tokenizer.vocab))
            for i, token in enumerate(tokens_to_add)
        ]
    ])
    tokenizer.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in tokenizer.vocab.items()])
    tokenizer.wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab, unk_token=tokenizer.unk_token)
        
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
        if training_args.do_eval
        else None
    )
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
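A quick sanity check one might add right after the vocabulary splice in this script, before training starts; it reuses the script's tokenizer, tokens_to_add and model names and is otherwise hypothetical.

# Added tokens should map to real ids, and the embedding matrix should
# have been resized to the new vocabulary size.
for token in tokens_to_add[:5]:
    assert tokenizer.convert_tokens_to_ids(token) != tokenizer.unk_token_id
assert model.get_input_embeddings().num_embeddings == len(tokenizer)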
Example #10
    def __init__(self,
                 vocab_file,
                 do_lower_case=False,
                 do_word_tokenize=True,
                 do_subword_tokenize=True,
                 word_tokenizer_type="basic",
                 subword_tokenizer_type="wordpiece",
                 never_split=None,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 mecab_kwargs=None,
                 **kwargs):
        """Constructs a MecabBertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_word_tokenize**: (`optional`) boolean (default True)
                Whether to do word tokenization.
            **do_subword_tokenize**: (`optional`) boolean (default True)
                Whether to do subword tokenization.
            **word_tokenizer_type**: (`optional`) string (default "basic")
                Type of word tokenizer.
            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                Type of subword tokenizer.
            **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None)
        """
        super(BertTokenizer, self).__init__(
            unk_token='<unk>' if word_tokenizer_type == 'sp' else '[UNK]',
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            do_lower_case=do_lower_case,
            do_word_tokenize=do_word_tokenize,
            do_subword_tokenize=do_subword_tokenize,
            word_tokenizer_type=word_tokenizer_type,
            subword_tokenizer_type=subword_tokenizer_type,
            never_split=never_split,
            mecab_kwargs=mecab_kwargs,
            **kwargs,
        )
        # ^^ We call the grandparent's init, not the parent's.

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])

        self.do_word_tokenize = do_word_tokenize
        self.word_tokenizer_type = word_tokenizer_type
        self.lower_case = do_lower_case
        self.never_split = never_split
        self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
        if do_word_tokenize:
            if word_tokenizer_type == "basic":
                self.word_tokenizer = BasicTokenizer(
                    do_lower_case=do_lower_case,
                    never_split=never_split,
                    tokenize_chinese_chars=False)
            elif word_tokenizer_type == "mecab":
                self.word_tokenizer = MecabTokenizer(
                    do_lower_case=do_lower_case,
                    never_split=never_split,
                    **(mecab_kwargs or {}))
            elif word_tokenizer_type == "sp":
                path_vocab = Path(vocab_file)
                self.word_tokenizer = SentencePiecepTokenizer(
                    model_file=str(path_vocab.parent / path_vocab.stem) +
                    '.model',
                    do_lower_case=do_lower_case,
                    never_split=never_split,
                    **(mecab_kwargs or {}))
            else:
                raise ValueError(
                    "Invalid word_tokenizer_type '{}' is specified.".format(
                        word_tokenizer_type))

        self.do_subword_tokenize = do_subword_tokenize
        self.subword_tokenizer_type = subword_tokenizer_type
        if do_subword_tokenize:
            if subword_tokenizer_type == "wordpiece":
                self.subword_tokenizer = WordpieceTokenizer(
                    vocab=self.vocab, unk_token=self.unk_token)
            elif subword_tokenizer_type == "character":
                self.subword_tokenizer = CharacterTokenizer(
                    vocab=self.vocab, unk_token=self.unk_token)
            else:
                raise ValueError(
                    "Invalid subword_tokenizer_type '{}' is specified.".format(
                        subword_tokenizer_type))
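The subword_tokenizer_type='character' branch relies on a CharacterTokenizer; transformers ships a comparable one for Japanese BERT that emits one vocabulary entry per character (falling back to the unknown token) instead of '##'-prefixed pieces. A toy sketch, assuming a recent transformers release:

from transformers.models.bert_japanese.tokenization_bert_japanese import CharacterTokenizer

vocab = {'[UNK]': 0, '日': 1, '本': 2, '語': 3}
ct = CharacterTokenizer(vocab=vocab, unk_token='[UNK]')
print(ct.tokenize('日本語だ'))  # ['日', '本', '語', '[UNK]']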
Example #11
class WordPieceVocab(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
    def __init__(self,
                 vocab_path,
                 do_lower_case=True,
                 max_len=None,
                 freq_path=None):
        """Constructs a BertTokenizer.

        Args:
          vocab_file: Path to a one-wordpiece-per-line vocabulary file
          max_len: An artificial maximum length to truncate tokenized sequences to;
                         Effective maximum length is always the minimum of this
                         value (if specified) and the underlying BERT model's
                         sequence length.
        """
        self.token_to_idx = json.load(open(vocab_path, 'r'),
                                      object_pairs_hook=OrderedDict)
        self.idx_to_token = OrderedDict([
            (idx, tok) for tok, idx in self.token_to_idx.items()
        ])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.token_to_idx)
        self.max_len = max_len if max_len is not None else int(1e12)

        if freq_path is not None:
            self.token_to_freq = json.load(open(freq_path, 'r'),
                                           object_pairs_hook=OrderedDict)

    def tokenize(self, text):
        split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def detokenize(self, tokens):
        text = ' '.join(tokens)
        return text.replace(' ##', '')

    def to_input_tensor(self, sents: List[List[str]], device) -> torch.Tensor:
        """ Convert list of tokens into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns (sents_var, mask_var): tensors of shape (batch_size, max_sentence_length)
        """
        sents = [self.convert_tokens_to_idx(sent) for sent in sents]
        sents, mask = self.pad_sentences(sents)
        sents_var = torch.tensor(sents, dtype=torch.long, device=device)
        mask_var = torch.tensor(mask, dtype=torch.long, device=device)
        return sents_var, mask_var

    def from_output_tensor(self, batch_output):
        """ Places batch output on cpu and converts it to tokens ignoring -1's and padding.
        args:
            batch_output    (tensor)   (batch_size, max_len)
        """
        place_on_cpu(batch_output)
        sents = []
        for output in batch_output:
            sent = []
            for idx in output:
                idx = idx.item()
                if idx == -1:
                    continue

                token = self.idx_to_token[idx]

                if token == "[PAD]":
                    continue

                sent.append(token)
            sents.append(sent)
        return sents

    def pad_sentences(self, sents):
        """
        args:
            sents   (list(list(str)))
        """
        sents_padded = []
        mask_padded = []

        max_len = max(map(len, sents))
        for sent in sents:
            sents_padded.append(sent[:] + [self.token_to_idx['[PAD]']] *
                                (max_len - len(sent)))

        mask = [[int(token != self.token_to_idx['[PAD]']) for token in sent]
                for sent in sents_padded]

        return sents_padded, mask

    def wrap_sentence(self, sent):
        """ Wrap sentences with start and stop tokens.
        args:
            sent (list[str]])
        """
        sent = ['[CLS]'] + sent + ['[SEP]']

        return sent

    def unwrap_sentence(self, tokens):
        new_tokens = [
            token for token in tokens if token != '[CLS]' and token != '[SEP]'
        ]
        return new_tokens

    def convert_tokens_to_idx(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.token_to_idx[token])
        if len(ids) > self.max_len:
            logging.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(
                    len(ids), self.max_len))
        return ids

    def convert_idxs_to_token(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.idx_to_token[i])
        return tokens

    def get_tokens_in_range(self, tokens, text, start, end):
        """
        Get all of the tokens that fall within the range (start, end) of the original string.
        """
        token_idxs = []
        find_start = 0

        for idx, token in enumerate(tokens):
            if token == "[CLS]" or token == "[SEP]":
                continue

            if token.startswith("##"):
                # remove pounds
                token = token[2:]

            token_start = text.find(token, find_start)
            token_end = token_start + len(token)
            find_start = token_end

            if ((token_start >= start and token_start < end)
                    or (token_end >= start and token_end < end)):
                token_idxs.append(idx)
        return token_idxs

    def __len__(self):
        """ Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.token_to_idx)
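A hedged usage sketch for WordPieceVocab; 'vocab.json' is a placeholder JSON file mapping wordpieces to ids and is assumed to contain '[PAD]', '[CLS]', '[SEP]' plus the pieces produced for the inputs.

# Hypothetical usage; 'vocab.json' is a placeholder.
vocab = WordPieceVocab('vocab.json')

sents = [vocab.wrap_sentence(vocab.tokenize(s)) for s in ('hello world', 'hi')]
ids, mask = vocab.to_input_tensor(sents, device='cpu')
# Rows are padded with '[PAD]' ids up to the longest sentence;
# mask holds 1 for real tokens and 0 for padding.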
Example #12
class VinaBertTokenizer(BertTokenizer):
    """BERT tokenizer for Vietnamese text; underthesea tokenization + WordPiece"""
    def __init__(self,
                 vocab_file,
                 do_lower_case=False,
                 do_basic_tokenize=True,
                 do_wordpiece_tokenize=True,
                 vina_dict_path=None,
                 unk_token='[UNK]',
                 sep_token='[SEP]',
                 pad_token='[PAD]',
                 cls_token='[CLS]',
                 mask_token='[MASK]',
                 **kwargs):
        """Constructs a underthesea BertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_basic_tokenize**: (`optional`) boolean (default True)
                Whether to do basic tokenization with underthesea before wordpiece.
            **vina_dict_path**: (`optional`) string
                Path to a directory containing an underthesea dictionary.
        """
        super(BertTokenizer, self).__init__(unk_token=unk_token,
                                            sep_token=sep_token,
                                            pad_token=pad_token,
                                            cls_token=cls_token,
                                            mask_token=mask_token,
                                            **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(
                    vocab_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_basic_tokenize = do_basic_tokenize
        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = VinaBasicTokenizer(
                do_lower_case=do_lower_case, vina_dict_path=vina_dict_path)

        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(
                vocab=self.vocab, unk_token=self.unk_token)

    def _tokenize(self, text):
        if self.do_basic_tokenize:
            tokens = self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens)
        else:
            tokens = [text]

        if self.do_wordpiece_tokenize:
            split_tokens = [
                sub_token for token in tokens
                for sub_token in self.wordpiece_tokenizer.tokenize(token)
            ]
        else:
            split_tokens = tokens

        return split_tokens
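A hedged usage sketch for VinaBertTokenizer; the vocabulary path is a placeholder and the exact pieces depend on its contents.

# Hypothetical usage; 'vocab.txt' is a placeholder.
tokenizer = VinaBertTokenizer(vocab_file='vocab.txt', do_lower_case=True)
tokens = tokenizer.tokenize('Xin chào thế giới')
# e.g. ['xin', 'chào', 'thế', 'giới'] if every word is in the vocabulary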