Example #1
    def __init__(
            self,
            vocab_file,
            do_lower_case=True,
            unk_token="[UNK]",
            sep_token="[SEP]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            mask_token="[MASK]",
            **kwargs
    ):
        """
        :param vocab_file: Path to a one-word-per-line vocabulary file
        :type vocab_file: str
        :param do_lower_case: Flag whether to lower case the input
        :type do_lower_case: bool
        """
        # TODO check why EmbeddingTokenizer.tokenize gives many UNK, while tokenize_with_metadata() works fine

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        if not os.path.isfile(vocab_file):
            raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.unk_tok_idx = self.vocab[unk_token]
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_lower_case = do_lower_case
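Every snippet on this page hands a plain one-token-per-line vocabulary file to load_vocab and then inverts the result to build ids_to_tokens. The helper itself is never shown; the sketch below assumes the conventional BERT-style implementation (the line number becomes the token id) and is only meant to illustrate what the examples expect from it.

import collections

def load_vocab(vocab_file):
    # Read a one-token-per-line vocabulary file into an ordered token -> index map.
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.rstrip("\n")] = index
    return vocab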
Example #2
    def __init__(self, vocab_file, do_lower_case=True,
                 do_basic_tokenize=True, do_char_tokenize=False,
                 do_wordpiece_tokenize=False, do_preprocessing=True,
                 unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]',
                 cls_token='[CLS]', mask_token='[MASK]', **kwargs):
        super(BertTokenizer, self).__init__(
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)

        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        self.do_lower_case = do_lower_case
        self.vocab_file = vocab_file
        self.do_basic_tokenize = do_basic_tokenize
        self.do_char_tokenize = do_char_tokenize
        self.unk_token = unk_token
        self.do_preprocessing = do_preprocessing

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])

        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                          unk_token=self.unk_token)
            
        self.base_bert_tok = BertTokenizer(
            vocab_file=self.vocab_file, do_lower_case=do_lower_case,
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)
Example #3
    def _load_bert_vocab(self, vocab_path, namespace):
        vocab: Dict[str, int] = load_vocab(vocab_path)
        for word, idx in vocab.items():
            try:
                self._token_to_index[namespace][word] = idx
                self._index_to_token[namespace][idx] = word
            except Exception:
                # Print the offending entry for debugging, then re-raise.
                print(word, type(word), idx)
                raise
Example #4
    @classmethod
    def from_config(cls, config: Config):
        basic_tokenizer = create_component(
            ComponentType.TOKENIZER, config.basic_tokenizer
        )
        vocab = load_vocab(config.wordpiece_vocab_path)
        wordpiece_tokenizer = WordpieceTokenizer(
            vocab=vocab, unk_token="[UNK]"
        )  # UNK is for compatibility with HF v0.5
        return cls(vocab, basic_tokenizer, wordpiece_tokenizer)
Example #5
    def __init__(self,
                 vocab_file,
                 do_lower_case=False,
                 do_basic_tokenize=True,
                 do_wordpiece_tokenize=True,
                 mecab_dict_path=None,
                 unk_token='[UNK]',
                 sep_token='[SEP]',
                 pad_token='[PAD]',
                 cls_token='[CLS]',
                 mask_token='[MASK]',
                 **kwargs):
        """Constructs a MecabBertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_basic_tokenize**: (`optional`) boolean (default True)
                Whether to do basic tokenization with MeCab before wordpiece.
            **mecab_dict_path**: (`optional`) string
                Path to a directory of a MeCab dictionary.
        """
        super(BertTokenizer, self).__init__(unk_token=unk_token,
                                            sep_token=sep_token,
                                            pad_token=pad_token,
                                            cls_token=cls_token,
                                            mask_token=mask_token,
                                            **kwargs)

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(
                    vocab_file))

        self.vocab = load_vocab(vocab_file)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.do_basic_tokenize = do_basic_tokenize
        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = MecabBasicTokenizer(
                do_lower_case=do_lower_case, mecab_dict_path=mecab_dict_path)

        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(
                vocab=self.vocab, unk_token=self.unk_token)
Example #6
    def __init__(
            self,
            vocab_file,
            do_lower_case=True,
            do_basic_tokenize=True,
            never_split=None,
            unk_token="[UNK]",
            sep_token="[SEP]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            mask_token="[MASK]",
            tokenize_chinese_chars=True,
            **kwargs
    ):
        super().__init__(
            vocab_file,
            do_lower_case,
            do_basic_tokenize,
            never_split,
            unk_token,
            sep_token,
            pad_token,
            cls_token,
            mask_token,
            tokenize_chinese_chars,
            **kwargs
        )
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
            )
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = RawRsvBasicTokenizer(
                do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars
            )
        self.wordpiece_tokenizer = RawRsvWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
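The max_len_single_sentence = max_len - 2 and max_len_sentences_pair = max_len - 3 bookkeeping in the last two examples reflects the standard BERT input format: a single sequence is wrapped as [CLS] A [SEP] (two special tokens), a pair as [CLS] A [SEP] B [SEP] (three). A small illustrative helper, not taken from any of the snippets:

def build_inputs(tokens_a, tokens_b=None, cls_token="[CLS]", sep_token="[SEP]"):
    # Single sequence: [CLS] A [SEP]         -> 2 special tokens, so max_len - 2 content tokens.
    # Sequence pair:   [CLS] A [SEP] B [SEP] -> 3 special tokens, so max_len - 3 content tokens.
    tokens = [cls_token] + list(tokens_a) + [sep_token]
    if tokens_b is not None:
        tokens += list(tokens_b) + [sep_token]
    return tokens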
Example #7
    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(
                vocab, unk_token=self.config.get("unk_token", "[UNK]"))

        tokens = []

        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
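For reference, the WordpieceTokenizer used throughout these examples performs greedy longest-match-first splitting against the loaded vocab, prefixing continuation pieces with "##" and falling back to the unk_token when a word cannot be covered, which is what produces ['He', '##llo', 'Wo', '##rld'] in the docstring above. A self-contained sketch of that behaviour for a single pre-tokenized word (names here are illustrative, not taken from any snippet):

def wordpiece_split(word, vocab, unk_token="[UNK]", max_chars_per_word=100):
    # Greedy longest-match-first subword splitting, BERT WordPiece style.
    if len(word) > max_chars_per_word:
        return [unk_token]
    pieces, start = [], 0
    while start < len(word):
        end, current = len(word), None
        while start < end:
            piece = word[start:end]
            if start > 0:
                piece = "##" + piece
            if piece in vocab:
                current = piece
                break
            end -= 1
        if current is None:
            return [unk_token]  # no piece in the vocab covers this span
        pieces.append(current)
        start = end
    return pieces

With a vocab containing "He", "##llo", "Wo" and "##rld", wordpiece_split("Hello", vocab) returns ['He', '##llo'], matching the example in the docstring.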