def test_wordpiece_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing"
    ]

    vocab = {}
    for (i, token) in enumerate(vocab_tokens):
        vocab[token] = i
    tokenizer = WordpieceTokenizer(vocab=vocab)

    self.assertListEqual(tokenizer.tokenize(""), [])

    self.assertListEqual(
        tokenizer.tokenize("unwanted running"),
        ["un", "##want", "##ed", "runn", "##ing"])

    self.assertListEqual(
        tokenizer.tokenize("unwantedX running"),
        ["[UNK]", "runn", "##ing"])
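The test above exercises the greedy longest-match-first behaviour of WordPiece. For reference, a minimal sketch of that lookup over a plain dict vocabulary is shown below; the function name and the fixed "[UNK]" token are assumptions for illustration, not the library's actual implementation (which also whitespace-splits its input and caps the word length).

# Illustrative sketch only: greedy longest-match-first WordPiece lookup,
# assuming a dict vocab and "[UNK]" as the unknown token.
def wordpiece_tokenize_sketch(word, vocab, unk_token="[UNK]"):
    pieces = []
    start = 0
    while start < len(word):
        end = len(word)
        cur_piece = None
        # Try the longest possible substring first, shrinking until a vocab hit.
        while start < end:
            piece = word[start:end]
            if start > 0:
                piece = "##" + piece  # continuation pieces carry the "##" prefix
            if piece in vocab:
                cur_piece = piece
                break
            end -= 1
        if cur_piece is None:
            # Any unmatchable remainder makes the whole word unknown.
            return [unk_token]
        pieces.append(cur_piece)
        start = end
    return pieces

# With the vocab from the test: "unwanted"  -> ["un", "##want", "##ed"]
#                               "unwantedX" -> ["[UNK]"]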
class SubwordTokenizer(Tokenizer):
    """
    Subword Tokenizer

    text -> [word tokens] -> [[sub word tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """

    def __init__(self, name, word_tokenizer, config={}):
        super(SubwordTokenizer, self).__init__(
            name, f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(
                self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)

            self.subword_tokenizer = WordpieceTokenizer(vocab)

        tokens = []
        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
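A hypothetical usage sketch, not taken from the codebase: `word_tokenizer` stands for any object exposing .tokenize() and .cache_name, and "vocab.txt" is a placeholder path to a WordPiece vocabulary file resolvable by the DataHandler.

# Hypothetical usage; names and paths below are placeholders, not project code.
tokenizer = SubwordTokenizer(
    "wordpiece", word_tokenizer, config={"vocab_path": "vocab.txt"})
print(tokenizer._wordpiece("Hello World", unit="text"))
# With a typical BERT-style vocab this prints something like
# ['He', '##llo', 'Wo', '##rld'] (exact pieces depend on the vocabulary).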
class BertLabelTokenizer:
    """Runs end-to-end tokenization: punctuation splitting + wordpiece."""

    def __init__(self, vocab_file, max_len=None, do_basic_tokenize=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        """Constructs a BertTokenizer.

        Args:
            vocab_file: Path to a one-wordpiece-per-line vocabulary file.
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            max_len: An artificial maximum length to truncate tokenized sequences to;
                the effective maximum length is always the minimum of this value
                (if specified) and the underlying BERT model's sequence length.
            never_split: List of tokens which will never be split during tokenization.
                Only has an effect when do_wordpiece_only=False.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        split_tokens = []
        token_begin_mask = []
        for token in text:
            wordpieces = self.wordpiece_tokenizer.tokenize(token)
            if len(wordpieces) > 0:
                for sub_token in wordpieces:
                    split_tokens.append(sub_token)
                token_begin_mask += [1] + [0] * (len(wordpieces) - 1)
        return split_tokens, token_begin_mask

    def tokenize_labels(self, text, labels):
        split_tokens = []
        split_labels = []
        token_begin_mask = []
        for token, label in zip(text, labels):
            wordpieces = self.wordpiece_tokenizer.tokenize(token)
            if len(wordpieces) > 0:
                for sub_token in wordpieces:
                    split_tokens.append(sub_token)
                split_labels += [label] + ["X"] * (len(wordpieces) - 1)
                token_begin_mask += [1] + [0] * (len(wordpieces) - 1)
        return split_tokens, split_labels, token_begin_mask

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                "sequence length for this BERT model ({} > {}). Running this "
                "sequence through BERT will result in indexing errors".format(
                    len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids into wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None,
                        *inputs, **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            vocab_file = pretrained_model_name_or_path
        if os.path.isdir(vocab_file):
            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index
            # sequences longer than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
        return tokenizer
def test_WordpieceTokenizer():
    model = WordpieceTokenizer(
        tokenization.load_vocab(
            os.path.join(model_dir, "bert-base-cased-vocab.txt")))
    print(model.tokenize("decomposition deoomposition"))
class WordPieceBatch(CharacterBatch, SpecialTokens):
    def __init__(self, min_char: int, vocab_file: str, lower: bool,
                 add_sentence_boundary: bool, add_word_boundary: bool,
                 use_cuda: bool):
        super(WordPieceBatch, self).__init__(
            min_char=min_char, lower=lower,
            add_sentence_boundary=add_sentence_boundary,
            add_word_boundary=add_word_boundary,
            use_cuda=use_cuda)
        self.vocab = load_vocab(vocab_file=vocab_file)
        self.tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def create_one_batch(self, raw_dataset: List[List[str]]):
        batch_size = len(raw_dataset)
        seq_len = max([len(input_) for input_ in raw_dataset])
        if self.add_sentence_boundary:
            seq_len += 2

        sub_tokens = []
        for raw_data in raw_dataset:
            item = []
            for token in raw_data:
                if self.lower:
                    token = token.lower()
                item.append(self.tokenizer.tokenize(token))
            sub_tokens.append(item)

        max_char_len = max(
            [len(token) for item in sub_tokens for token in item])
        max_char_len = max(max_char_len, self.min_char)
        if self.add_word_boundary:
            max_char_len += 2

        batch = torch.LongTensor(
            batch_size, seq_len, max_char_len).fill_(self.pad_id)
        lengths = torch.LongTensor(batch_size, seq_len).fill_(1)

        for i, item in enumerate(sub_tokens):
            if self.add_sentence_boundary:
                item = [self.bos] + item + [self.eos]
            for j, token in enumerate(item):
                if self.add_sentence_boundary and (token == self.bos or token == self.eos):
                    if self.add_word_boundary:
                        lengths[i, j] = 3
                        batch[i, j, 0] = self.mapping.get(self.bow)
                        batch[i, j, 1] = self.mapping.get(token)
                        batch[i, j, 2] = self.mapping.get(self.eow)
                    else:
                        lengths[i, j] = 1
                        batch[i, j, 0] = self.mapping.get(token)
                else:
                    if self.add_word_boundary:
                        lengths[i, j] = len(token) + 2
                        batch[i, j, 0] = self.mapping.get(self.bow)
                        for k, sub_token in enumerate(token):
                            batch[i, j, k + 1] = self.mapping.get(
                                sub_token, self.oov_id)
                        batch[i, j, len(token) + 1] = self.mapping.get(self.eow)
                    else:
                        lengths[i, j] = len(token)
                        for k, sub_token in enumerate(token):
                            batch[i, j, k] = self.mapping.get(
                                sub_token, self.oov_id)

        if self.use_cuda:
            batch = batch.cuda()
            lengths = lengths.cuda()
        return batch, lengths

    def create_dict_from_dataset(self, raw_dataset: List[List[str]]):
        n_entries = 0
        for raw_data in raw_dataset:
            for token in raw_data:
                if self.lower:
                    token = token.lower()
                for sub_token in self.tokenizer.tokenize(token):
                    if sub_token not in self.mapping:
                        self.mapping[sub_token] = len(self.mapping)
                        n_entries += 1
        logger.info('+ loaded {0} entries from input'.format(n_entries))
        logger.info('+ current number of entries in mapping is: {0}'.format(
            len(self.mapping)))
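A minimal sketch of how the two methods above might be used together. The constructor arguments and the vocabulary path are assumptions for illustration, and the inherited mapping/pad/oov attributes are presumed to be set up by CharacterBatch.

# Hypothetical usage; argument values and file name below are placeholders.
batcher = WordPieceBatch(
    min_char=1, vocab_file="bert-base-cased-vocab.txt", lower=True,
    add_sentence_boundary=False, add_word_boundary=True, use_cuda=False)
sentences = [["unwanted", "running"], ["hello"]]
batcher.create_dict_from_dataset(sentences)          # build sub-token -> id mapping
batch, lengths = batcher.create_one_batch(sentences)
# batch:   LongTensor of shape (batch_size, seq_len, max_char_len) with sub-token ids
# lengths: LongTensor of shape (batch_size, seq_len) with per-word sub-token counts
#          (plus word boundaries when add_word_boundary is True)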
print(list(user_repr.items())[0])

###
### add skeleton and subword position
for d in train:
    edu = d[2]
    fas = d[3]
    idx2fa = {}
    for k, v in fas.items():
        for vv in v:
            idx2fa[vv] = k
    _edu = []
    fa2pos = {}
    for idx, token in enumerate(edu.lower().split()):
        sub_tokens = wordpiece_tokenizer.tokenize(token)
        pos = []
        curr_idx = len(_edu)
        for sub_token in sub_tokens:
            _edu.append(sub_token)
            pos.append(curr_idx)
            curr_idx += 1
        if idx in idx2fa:
            fa = idx2fa[idx]
            fa2pos[fa] = pos
    d.append(_edu)
    d.append(fa2pos)

for d in dev:
    edu = d[2]
    fas = d[3]