Code Example #1
    def test_wordpiece_tokenizer(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing"
        ]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab)

        self.assertListEqual(tokenizer.tokenize(""), [])

        self.assertListEqual(tokenizer.tokenize("unwanted running"),
                             ["un", "##want", "##ed", "runn", "##ing"])

        self.assertListEqual(tokenizer.tokenize("unwantedX running"),
                             ["[UNK]", "runn", "##ing"])
Code Example #2
File: subword.py  Project: seongl/claf
class SubwordTokenizer(Tokenizer):
    """
    Subword Tokenizer

    text -> [word tokens] -> [[sub word tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """

    def __init__(self, name, word_tokenizer, config=None):
        super(SubwordTokenizer, self).__init__(name, f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config or {}
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(vocab)

        tokens = []

        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
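
A hypothetical usage sketch for the class above (the whitespace word tokenizer and the vocab path are illustrative stand-ins, not claf's actual tokenizer registry):

class WhitespaceWordTokenizer:
    # Minimal stand-in: SubwordTokenizer only needs .tokenize() and .cache_name.
    cache_name = "whitespace"

    def tokenize(self, text):
        return text.split()

subword = SubwordTokenizer("wordpiece", WhitespaceWordTokenizer(),
                           config={"vocab_path": "path/to/vocab.txt"})
print(subword._wordpiece("Hello World"))         # e.g. ['He', '##llo', 'Wo', '##rld'] per the docstring
print(subword._wordpiece("Hello", unit="word"))  # tokenize a single pre-split word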
Code Example #3
class BertLabelTokenizer:
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
    def __init__(self,
                 vocab_file,
                 max_len=None,
                 do_basic_tokenize=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        """Constructs a BertTokenizer.

        Args:
          vocab_file: Path to a one-wordpiece-per-line vocabulary file
          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
          max_len: An artificial maximum length to truncate tokenized sequences to;
                         Effective maximum length is always the minimum of this
                         value (if specified) and the underlying BERT model's
                         sequence length.
          never_split: List of tokens which will never be split during tokenization.
                         Only has an effect when do_basic_tokenize=True
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        split_tokens = []
        token_begin_mask = []
        for token in text:
            wordpieces = self.wordpiece_tokenizer.tokenize(token)
            if len(wordpieces) > 0:
                for sub_token in wordpieces:
                    split_tokens.append(sub_token)
                token_begin_mask += [1] + [0] * (len(wordpieces) - 1)
        return split_tokens, token_begin_mask

    def tokenize_labels(self, text, labels):
        split_tokens = []
        split_labels = []
        token_begin_mask = []
        for token, label in zip(text, labels):
            wordpieces = self.wordpiece_tokenizer.tokenize(token)
            if len(wordpieces) > 0:
                for sub_token in wordpieces:
                    split_tokens.append(sub_token)
                split_labels += [label] + ["X"] * (len(wordpieces) - 1)
                token_begin_mask += [1] + [0] * (len(wordpieces) - 1)
        return split_tokens, split_labels, token_begin_mask

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(
                    len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        cache_dir=None,
                        *inputs,
                        **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
                pretrained_model_name_or_path]
        else:
            vocab_file = pretrained_model_name_or_path
        if os.path.isdir(vocab_file):
            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
        return tokenizer
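
To make the label alignment concrete, here is a small runnable sketch that reuses the toy vocabulary from Code Example #1 (the label strings "B-ADJ"/"O" are illustrative):

import tempfile

vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed",
                "wa", "un", "runn", "##ing"]
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("\n".join(vocab_tokens))            # one wordpiece per line

tokenizer = BertLabelTokenizer(f.name)
tokens, labels, mask = tokenizer.tokenize_labels(["unwanted", "running"],
                                                 ["B-ADJ", "O"])
# tokens -> ["un", "##want", "##ed", "runn", "##ing"]
# labels -> ["B-ADJ", "X", "X", "O", "X"]   (label on the first piece, "X" on the rest)
# mask   -> [1, 0, 0, 1, 0]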
Code Example #4
def test_WordpieceTokenizer():
    model = WordpieceTokenizer(
        tokenization.load_vocab(
            os.path.join(model_dir, "bert-base-cased-vocab.txt")))
    print(model.tokenize("decomposition deoomposition"))
Code Example #5
class WordPieceBatch(CharacterBatch, SpecialTokens):
    def __init__(self, min_char: int, vocab_file: str, lower: bool,
                 add_sentence_boundary: bool, add_word_boundary: bool,
                 use_cuda: bool):
        super(WordPieceBatch,
              self).__init__(min_char=min_char,
                             lower=lower,
                             add_sentence_boundary=add_sentence_boundary,
                             add_word_boundary=add_word_boundary,
                             use_cuda=use_cuda)
        self.vocab = load_vocab(vocab_file=vocab_file)
        self.tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def create_one_batch(self, raw_dataset: List[List[str]]):
        batch_size = len(raw_dataset)
        seq_len = max([len(input_) for input_ in raw_dataset])
        if self.add_sentence_boundary:
            seq_len += 2

        sub_tokens = []
        for raw_data in raw_dataset:
            item = []
            for token in raw_data:
                if self.lower:
                    token = token.lower()
                item.append(self.tokenizer.tokenize(token))
            sub_tokens.append(item)
        max_char_len = max(
            [len(token) for item in sub_tokens for token in item])
        max_char_len = max(max_char_len, self.min_char)
        if self.add_word_boundary:
            max_char_len += 2

        batch = torch.LongTensor(batch_size, seq_len,
                                 max_char_len).fill_(self.pad_id)
        lengths = torch.LongTensor(batch_size, seq_len).fill_(1)
        for i, item in enumerate(sub_tokens):
            if self.add_sentence_boundary:
                item = [self.bos] + item + [self.eos]
            for j, token in enumerate(item):
                if self.add_sentence_boundary and (token == self.bos
                                                   or token == self.eos):
                    if self.add_word_boundary:
                        lengths[i, j] = 3
                        batch[i, j, 0] = self.mapping.get(self.bow)
                        batch[i, j, 1] = self.mapping.get(token)
                        batch[i, j, 2] = self.mapping.get(self.eow)
                    else:
                        lengths[i, j] = 1
                        batch[i, j, 0] = self.mapping.get(token)
                else:
                    if self.add_word_boundary:
                        lengths[i, j] = len(token) + 2
                        batch[i, j, 0] = self.mapping.get(self.bow)
                        for k, sub_token in enumerate(token):
                            batch[i, j, k + 1] = self.mapping.get(
                                sub_token, self.oov_id)
                        batch[i, j,
                              len(token) + 1] = self.mapping.get(self.eow)
                    else:
                        lengths[i, j] = len(token)
                        for k, sub_token in enumerate(token):
                            batch[i, j, k] = self.mapping.get(
                                sub_token, self.oov_id)

        if self.use_cuda:
            batch = batch.cuda()
            lengths = lengths.cuda()
        return batch, lengths

    def create_dict_from_dataset(self, raw_dataset: List[List[str]]):
        n_entries = 0
        for raw_data in raw_dataset:
            for token in raw_data:
                if self.lower:
                    token = token.lower()
                for sub_token in self.tokenizer.tokenize(token):
                    if sub_token not in self.mapping:
                        self.mapping[sub_token] = len(self.mapping)
                        n_entries += 1
        logger.info('+ loaded {0} entries from input'.format(n_entries))
        logger.info('+ current number of entries in mapping is: {0}'.format(
            len(self.mapping)))
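
A hypothetical call sequence for the batcher above (the constructor arguments are placeholders; CharacterBatch and SpecialTokens, which supply self.mapping, bos/eos, bow/eow and the pad/oov ids, come from the surrounding project):

# batcher = WordPieceBatch(min_char=1, vocab_file="vocab.txt", lower=True,
#                          add_sentence_boundary=True, add_word_boundary=True,
#                          use_cuda=False)
# batcher.create_dict_from_dataset(dataset)         # populate self.mapping with sub-tokens
# batch, lengths = batcher.create_one_batch(dataset)
# batch:   LongTensor of shape (batch_size, seq_len [+2 for bos/eos],
#                               max_char_len [+2 for bow/eow]) holding sub-token ids
# lengths: LongTensor of shape (batch_size, seq_len) with per-word sub-token counts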
Code Example #6
print(list(user_repr.items())[0])

###
### add skeleton and subword position
for d in train:
    edu = d[2]
    fas = d[3]
    idx2fa = {}
    for k, v in fas.items():
        for vv in v:
            idx2fa[vv] = k
    _edu = []
    fa2pos = {}
    for idx, token in enumerate(edu.lower().split()):
        sub_tokens = wordpiece_tokenizer.tokenize(token)
        pos = []
        curr_idx = len(_edu)
        for sub_token in sub_tokens:
            _edu.append(sub_token)
            pos.append(curr_idx)
            curr_idx += 1
        if idx in idx2fa:
            fa = idx2fa[idx]
            fa2pos[fa] = pos
    d.append(_edu)
    d.append(fa2pos)
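
An illustrative trace of the alignment above (the edu string, the fas mapping, and the subword splits are made up; actual splits depend on wordpiece_tokenizer's vocabulary):

# edu = "unwanted running"; fas = {"fa1": [0]}         # word index 0 belongs to fa1
# _edu   -> ["un", "##want", "##ed", "runn", "##ing"]  # flat subword sequence
# fa2pos -> {"fa1": [0, 1, 2]}                         # word 0 covers subword positions 0-2
# Both are appended to d: d[-2] is the subword sequence, d[-1] the per-fa position lists.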

for d in dev:
    edu = d[2]
    fas = d[3]