Example #1
    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                      unk_token=self.unk_token)
        try:
            import rjieba
        except ImportError:
            raise ImportError(
                "You need to install rjieba to use RoFormerTokenizer. "
                "See https://pypi.org/project/rjieba/ for installation.")
        self.jieba = rjieba
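For context, here is a minimal usage sketch. It assumes this __init__ belongs to RoFormerTokenizer from the transformers library (consistent with the rjieba error message above); "vocab.txt" is a placeholder path, not part of the snippet.

# Hypothetical usage sketch; "vocab.txt" stands in for a real WordPiece vocabulary file.
from transformers import RoFormerTokenizer

tokenizer = RoFormerTokenizer("vocab.txt")      # rjieba must be installed
tokens = tokenizer.tokenize("今天天气非常好")     # jieba pre-segmentation, then basic + WordPiece
ids = tokenizer.convert_tokens_to_ids(tokens)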
Example #2
    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 do_basic_tokenize=True,
                 do_char_tokenize=False,
                 do_wordpiece_tokenize=False,
                 do_preprocessing=True,
                 unk_token='[UNK]',
                 sep_token='[SEP]',
                 pad_token='[PAD]',
                 cls_token='[CLS]',
                 mask_token='[MASK]',
                 **kwargs):
        super(BertTokenizer, self).__init__(unk_token=unk_token,
                                            sep_token=sep_token,
                                            pad_token=pad_token,
                                            cls_token=cls_token,
                                            mask_token=mask_token,
                                            **kwargs)

        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        self.do_lower_case = do_lower_case
        self.vocab_file = vocab_file
        self.do_basic_tokenize = do_basic_tokenize
        self.do_char_tokenize = do_char_tokenize
        self.unk_token = unk_token
        self.do_preprocessing = do_preprocessing

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(
                    vocab_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])

        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(
                vocab=self.vocab, unk_token=self.unk_token)

        self.base_bert_tok = BertTokenizer(vocab_file=self.vocab_file,
                                           do_lower_case=do_lower_case,
                                           unk_token=unk_token,
                                           sep_token=sep_token,
                                           pad_token=pad_token,
                                           cls_token=cls_token,
                                           mask_token=mask_token,
                                           **kwargs)
Example #3
    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 **kwargs):
        """
        :param vocab_file: Path to a one-word-per-line vocabulary file
        :type vocab_file: str
        :param do_lower_case: Flag whether to lower case the input
        :type do_lower_case: bool
        """
        # TODO check why EmbeddingTokenizer.tokenize gives many UNK, while tokenize_with_metadata() works fine

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(
                    vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.unk_tok_idx = self.vocab[unk_token]
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_lower_case = do_lower_case
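The vocab / ids_to_tokens pair built in each constructor provides both lookup directions. A small self-contained illustration of how they are typically used for encoding and decoding (the toy vocabulary below is hypothetical, not taken from any of the examples):

import collections

# Toy vocabulary standing in for the contents of vocab_file.
vocab = collections.OrderedDict(
    [("[UNK]", 0), ("[CLS]", 1), ("[SEP]", 2), ("hello", 3), ("world", 4)]
)
ids_to_tokens = collections.OrderedDict((ids, tok) for tok, ids in vocab.items())

tokens = ["[CLS]", "hello", "world", "[SEP]"]
ids = [vocab.get(tok, vocab["[UNK]"]) for tok in tokens]  # token -> id, falling back to [UNK]
print(ids)                                                # [1, 3, 4, 2]
print([ids_to_tokens[i] for i in ids])                    # id -> token round trip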