def __init__(
    self,
    vocab_file,
    do_lower_case=True,
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    **kwargs
):
    """
    :param vocab_file: Path to a one-word-per-line vocabulary file
    :type vocab_file: str
    :param do_lower_case: Flag whether to lower case the input
    :type do_lower_case: bool
    """
    # TODO check why EmbeddingTokenizer.tokenize gives many UNK, while tokenize_with_metadata() works fine
    super().__init__(
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        **kwargs,
    )
    if not os.path.isfile(vocab_file):
        raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    self.unk_tok_idx = self.vocab[unk_token]
    self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
    self.do_lower_case = do_lower_case
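# Every constructor in this section depends on `load_vocab`. A minimal sketch
# of that helper, assuming the usual BERT convention of one token per line
# mapped to its line index (this mirrors the behavior the callers rely on; it
# is not necessarily the exact implementation each of them imports):
import collections


def load_vocab(vocab_file):
    """Load a one-token-per-line vocabulary file into an ordered token -> id dict."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.rstrip("\n")] = index
    return vocab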
def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True,
             do_char_tokenize=False, do_wordpiece_tokenize=False,
             do_preprocessing=True, unk_token='[UNK]', sep_token='[SEP]',
             pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]',
             **kwargs):
    super(BertTokenizer, self).__init__(
        unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
        cls_token=cls_token, mask_token=mask_token, **kwargs)
    self.do_wordpiece_tokenize = do_wordpiece_tokenize
    self.do_lower_case = do_lower_case
    self.vocab_file = vocab_file
    self.do_basic_tokenize = do_basic_tokenize
    self.do_char_tokenize = do_char_tokenize
    self.unk_token = unk_token
    self.do_preprocessing = do_preprocessing

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'.".format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])

    if do_wordpiece_tokenize:
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token=self.unk_token)

    self.base_bert_tok = BertTokenizer(
        vocab_file=self.vocab_file, do_lower_case=do_lower_case,
        unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
        cls_token=cls_token, mask_token=mask_token, **kwargs)
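# Design note: besides storing its own tokenization flags (do_char_tokenize,
# do_preprocessing, ...), this constructor builds `base_bert_tok`, an instance
# of the BertTokenizer class referenced in this module (presumably the stock
# HuggingFace one), so the custom character/preprocessing paths can fall back
# to standard BERT tokenization when they are disabled.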
from typing import Dict


def _load_bert_vocab(self, vocab_path, namespace):
    vocab: Dict[str, int] = load_vocab(vocab_path)
    for word, idx in vocab.items():
        try:
            self._token_to_index[namespace][word] = idx
            self._index_to_token[namespace][idx] = word
        except Exception:
            # Surface the offending entry before re-raising.
            print(word, type(word), idx)
            raise
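# Note on the error handling above: logging the token, its runtime type, and
# its index before re-raising makes a malformed or mis-encoded vocabulary line
# easy to locate; the two dicts keep the per-namespace token -> id and
# id -> token tables in sync.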
@classmethod
def from_config(cls, config: Config):
    basic_tokenizer = create_component(
        ComponentType.TOKENIZER, config.basic_tokenizer
    )
    vocab = load_vocab(config.wordpiece_vocab_path)
    wordpiece_tokenizer = WordpieceTokenizer(
        vocab=vocab, unk_token="[UNK]"
    )  # UNK is for compatibility with HF v0.5
    return cls(vocab, basic_tokenizer, wordpiece_tokenizer)
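# Design note: `from_config` is a factory classmethod in a
# create_component/Config style. The basic tokenizer is built from its own
# sub-config, while the wordpiece vocabulary comes straight from
# `wordpiece_vocab_path`; the unk token is pinned to "[UNK]" rather than read
# from config, per the inline comment, for compatibility with HuggingFace
# v0.5 vocabularies.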
def __init__(self, vocab_file, do_lower_case=False,
             do_basic_tokenize=True, do_wordpiece_tokenize=True,
             mecab_dict_path=None, unk_token='[UNK]', sep_token='[SEP]',
             pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]',
             **kwargs):
    """Constructs a MecabBertTokenizer.

    Args:
        **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lower case the input.
            Only has an effect when do_basic_tokenize=True.
        **do_basic_tokenize**: (`optional`) boolean (default True)
            Whether to do basic tokenization with MeCab before wordpiece.
        **mecab_dict_path**: (`optional`) string
            Path to a directory of a MeCab dictionary.
    """
    super(BertTokenizer, self).__init__(
        unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
        cls_token=cls_token, mask_token=mask_token, **kwargs)

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'.".format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
    self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])
    self.do_basic_tokenize = do_basic_tokenize
    self.do_wordpiece_tokenize = do_wordpiece_tokenize

    if do_basic_tokenize:
        self.basic_tokenizer = MecabBasicTokenizer(
            do_lower_case=do_lower_case, mecab_dict_path=mecab_dict_path)
    if do_wordpiece_tokenize:
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token=self.unk_token)
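# Hypothetical usage of the MeCab-backed tokenizer above; both paths are
# placeholders, and a MeCab dictionary must be installed for this to run:
tokenizer = MecabBertTokenizer(
    vocab_file="vocab.txt",
    mecab_dict_path="/usr/local/lib/mecab/dic/ipadic",
)
# MecabBasicTokenizer first splits the text into morphemes, and
# WordpieceTokenizer then splits each morpheme into subwords.
print(tokenizer.tokenize("日本語のテキストです"))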
def __init__(
    self,
    vocab_file,
    do_lower_case=True,
    do_basic_tokenize=True,
    never_split=None,
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    tokenize_chinese_chars=True,
    **kwargs
):
    super().__init__(
        vocab_file,
        do_lower_case,
        do_basic_tokenize,
        never_split,
        unk_token,
        sep_token,
        pad_token,
        cls_token,
        mask_token,
        tokenize_chinese_chars,
        **kwargs
    )
    self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
    self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
            "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
        )
    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
    self.do_basic_tokenize = do_basic_tokenize
    if do_basic_tokenize:
        self.basic_tokenizer = RawRsvBasicTokenizer(
            do_lower_case=do_lower_case,
            never_split=never_split,
            tokenize_chinese_chars=tokenize_chinese_chars,
        )
    self.wordpiece_tokenizer = RawRsvWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
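# Why the constructors above reserve 2 and 3 positions out of max_len: BERT
# wraps every input in special tokens, so the usable budget is smaller.
#   single sentence: [CLS] tokens ... [SEP]         -> 2 special tokens
#   sentence pair:   [CLS] A ... [SEP] B ... [SEP]  -> 3 special tokens
max_len = 512                           # assumed BERT-base position limit
max_len_single_sentence = max_len - 2   # 510 usable tokens
max_len_sentences_pair = max_len - 3    # 509 usable tokens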
def _wordpiece(self, text, unit="text"):
    """
    ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
    """
    # Lazily build the wordpiece tokenizer on first use.
    if self.subword_tokenizer is None:
        vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
        vocab = load_vocab(vocab_path)
        self.subword_tokenizer = WordpieceTokenizer(
            vocab, unk_token=self.config.get("unk_token", "[UNK]"))

    tokens = []
    if unit == "word":
        # `text` is already a single word; split it into subwords directly.
        for sub_token in self.subword_tokenizer.tokenize(text):
            tokens.append(sub_token)
    else:
        # Split into words first, then split each word into subwords.
        for token in self.word_tokenizer.tokenize(text):
            for sub_token in self.subword_tokenizer.tokenize(token):
                tokens.append(sub_token)
    return tokens
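# A runnable sketch of the two `unit` modes above, using a toy vocabulary and
# plain whitespace splitting in place of `self.word_tokenizer` (both are
# assumptions for the demo, not part of the snippet):
# from transformers.models.bert.tokenization_bert import WordpieceTokenizer
toy_vocab = {"He": 0, "##llo": 1, "Wo": 2, "##rld": 3, "[UNK]": 4}
subword = WordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")

print(subword.tokenize("Hello"))  # unit="word": ['He', '##llo']

# unit="text": split into words first, then into subwords.
pieces = [p for w in "Hello World".split() for p in subword.tokenize(w)]
print(pieces)                     # ['He', '##llo', 'Wo', '##rld']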