def __init__(
        self,
        vocab: Optional[str] = None,
        replacement: str = "▁",
        add_prefix_space: bool = True,
    ):
        if vocab is not None:
            # Build the model from an existing vocab file when one is provided
            tokenizer = Tokenizer(Unigram(vocab))
        else:
            tokenizer = Tokenizer(Unigram())

        tokenizer.normalizer = normalizers.Sequence([
            normalizers.Nmt(),
            normalizers.NFKC(),
        ])
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement=replacement,
                                     add_prefix_space=add_prefix_space),
        ])
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space)

        parameters = {
            "model": "SentencePieceUnigram",
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
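
A minimal usage sketch for the constructor above, written against the raw Tokenizer API (the toy corpus, vocab size, and "<unk>" token are hypothetical placeholders):

from tokenizers import Tokenizer, trainers
from tokenizers.models import Unigram

# Sketch: fit an empty Unigram model on a tiny in-memory corpus.
tokenizer = Tokenizer(Unigram())
trainer = trainers.UnigramTrainer(vocab_size=100, special_tokens=["<unk>"], unk_token="<unk>")
tokenizer.train_from_iterator(["some raw text", "more raw text"], trainer=trainer)
print(tokenizer.encode("some more text").tokens)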
Example #2
    def pre_tokenizer(self, replacement, add_prefix_space):
        return pre_tokenizers.Sequence(
            [
                pre_tokenizers.WhitespaceSplit(),
                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
            ]
        )
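
As a quick sanity check, the WhitespaceSplit + Metaspace sequence returned above behaves roughly like this (a standalone sketch; exact offsets are indicative):

from tokenizers import pre_tokenizers

pt = pre_tokenizers.Sequence([
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True),
])
# Each word is prefixed with the "▁" space marker,
# e.g. [('▁Hello', ...), ('▁world', ...)]
print(pt.pre_tokenize_str("Hello world"))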
    @staticmethod
    def from_spm(filename: str):
        try:
            import sys

            sys.path.append(".")

            import sentencepiece_model_pb2 as model
        except Exception:
            raise Exception(
                "You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
            )

        m = model.ModelProto()
        with open(filename, "rb") as f:
            m.ParseFromString(f.read())

        precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
        vocab = [(piece.piece, piece.score) for piece in m.pieces]
        unk_id = m.trainer_spec.unk_id
        model_type = m.trainer_spec.model_type
        if model_type != 1:
            raise Exception(
                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
            )

        data = {"unk_id": unk_id, "vocab": vocab}

        replacement = "▁"
        add_prefix_space = True

        out_vocab_filename = f"{filename}.json"
        try:
            with open(out_vocab_filename, "w") as f:
                json.dump(data, f, indent=4)

            tokenizer = Tokenizer(Unigram(out_vocab_filename))
        finally:
            os.remove(out_vocab_filename)

        tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.WhitespaceSplit(),
                pre_tokenizers.Metaspace(
                    replacement=replacement, add_prefix_space=add_prefix_space
                ),
            ]
        )
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space
        )

        parameters = {
            "model": "SentencePieceUnigram",
        }

        # Skip SentencePieceUnigramTokenizer.__init__ (which would rebuild the
        # pipeline from scratch) and initialize the BaseTokenizer wrapper directly.
        obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
        BaseTokenizer.__init__(obj, tokenizer, parameters)
        return obj
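
Assuming from_spm is exposed as a static method on SentencePieceUnigramTokenizer as above, converting an existing SentencePiece model would then look roughly like this (the .model filename is a placeholder):

# Sketch: wrap an existing SentencePiece unigram model.
tok = SentencePieceUnigramTokenizer.from_spm("spiece.model")
print(tok.encode("Hello world").tokens)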
    def pre_tokenizer(self, replacement, add_prefix_space):
        list_pretokenizers = []
        if self.original_tokenizer.split_by_punct:
            list_pretokenizers.append(pre_tokenizers.Punctuation(behavior="isolated"))
        list_pretokenizers.append(
            pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space))
        return pre_tokenizers.Sequence(list_pretokenizers)
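
A small sketch of what behavior="isolated" means for the Punctuation pre-tokenizer used above (output in the comment is indicative):

from tokenizers import pre_tokenizers

pt = pre_tokenizers.Punctuation(behavior="isolated")
# Punctuation marks become standalone segments instead of being merged away:
# e.g. [('Hello', ...), (',', ...), (' world', ...), ('!', ...)]
print(pt.pre_tokenize_str("Hello, world!"))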
Example #5
    def test_continuing_prefix_trainer_mismatch(self):
        UNK = "[UNK]"
        special_tokens = [UNK]
        tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
        trainer = trainers.BpeTrainer(special_tokens=special_tokens)
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [pre_tokenizers.Whitespace(), pre_tokenizers.Digits(individual_digits=True)]
        )
        tokenizer.train(files=["data/big.txt"], trainer=trainer)

        tokenizer.save("data/tokenizer.json")

        tokenizer = Tokenizer.from_file("data/tokenizer.json")
    def __init__(
        self,
        replacement: str = "▁",
        add_prefix_space: bool = True,
        unk_token: Union[str, AddedToken] = "<unk>",
        eos_token: Union[str, AddedToken] = "</s>",
        pad_token: Union[str, AddedToken] = "<pad>",
    ):
        self.special_tokens = {
            "pad": {"id": 0, "token": pad_token},
            "eos": {"id": 1, "token": eos_token},
            "unk": {"id": 2, "token": unk_token},
        }

        self.special_tokens_list = [None] * len(self.special_tokens)
        for token_dict in self.special_tokens.values():
            self.special_tokens_list[token_dict["id"]] = token_dict["token"]

        tokenizer = Tokenizer(Unigram())

        tokenizer.normalizer = normalizers.Sequence(
            [
                normalizers.Nmt(),
                normalizers.NFKC(),
                normalizers.Replace(Regex(" {2,}"), " "),
                normalizers.Lowercase(),
            ]
        )
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
                pre_tokenizers.Digits(individual_digits=True),
                pre_tokenizers.Punctuation(),
            ]
        )
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)

        tokenizer.post_processor = TemplateProcessing(
            single=f"$A {self.special_tokens['eos']['token']}",
            special_tokens=[(self.special_tokens["eos"]["token"], self.special_tokens["eos"]["id"])],
        )

        parameters = {
            "model": "SentencePieceUnigram",
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
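
To see the eos template in action without training a Unigram model, here is a standalone sketch using a tiny WordLevel vocab (all tokens and ids are illustrative):

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

tok = Tokenizer(WordLevel({"<pad>": 0, "</s>": 1, "<unk>": 2, "hello": 3, "world": 4}, unk_token="<unk>"))
tok.pre_tokenizer = Whitespace()
# Same single-sequence template as above: append "</s>" (id 1) to every encoding.
tok.post_processor = TemplateProcessing(single="$A </s>", special_tokens=[("</s>", 1)])
print(tok.encode("hello world").tokens)  # ['hello', 'world', '</s>']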
Example #7
    def configure_tokenizers(self, padding, truncation, max_length, lower):
        # Settings
        pad_length = None
        if padding in {True, "longest"}:
            pass
        elif padding in {"max_length"}:
            pad_length = max_length
        elif padding in {False, "do_not_pad"}:
            pass
        else:
            raise ValueError("Unknown padding type")

        # SRC tokenizer
        tok_normalizers = [NFD(), Strip()]
        if lower:
            tok_normalizers += [Lowercase()]

        self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
        self.tokenizer.add_special_tokens(self.special_tokens)
        self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [WhitespaceSplit()])
        self.tokenizer.normalizer = normalizers.Sequence(
            tok_normalizers)  # StripAccents requires NFD
        self.tokenizer.decoder = tok_decoder()

        # Define template (Needed for the sos/eos tokens)
        basic_template = TemplateProcessing(
            single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
            pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
            special_tokens=[
                (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
                (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD)),
            ],
        )
        self.tokenizer.post_processor = basic_template

        if padding:
            self.tokenizer.enable_padding(
                pad_id=self.tokenizer.token_to_id(self.PAD_WORD),
                pad_token=self.PAD_WORD,
                length=pad_length,
            )
        if truncation:
            self.tokenizer.enable_truncation(
                max_length, stride=0, strategy='longest_first')
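
A compact sketch of the padding/truncation setup above, using a toy WordLevel vocab (ids and tokens are illustrative):

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

tok = Tokenizer(WordLevel({"[PAD]": 0, "[UNK]": 1, "hello": 2, "world": 3}, unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()
tok.enable_padding(pad_id=0, pad_token="[PAD]", length=4)
tok.enable_truncation(4, stride=0, strategy="longest_first")
print(tok.encode("hello world").tokens)  # ['hello', 'world', '[PAD]', '[PAD]']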
    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.encoder
        merges = list(self.original_tokenizer.bpe_ranks.keys())
        unk_token = self.original_tokenizer.unk_token

        tokenizer = Tokenizer(
            BPE(
                vocab=vocab,
                merges=merges,
                dropout=None,
                continuing_subword_prefix="",
                end_of_word_suffix="</w>",
                fuse_unk=False,
                unk_token=str(unk_token),
            ))

        tokenizer.normalizer = normalizers.Sequence([
            normalizers.NFC(),
            normalizers.Replace(Regex(r"\s+"), " "),
            normalizers.Lowercase()
        ])
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.Split(
                Regex(
                    r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""
                ),
                behavior="removed",
                invert=True,
            ),
            pre_tokenizers.ByteLevel(add_prefix_space=False),
        ])
        tokenizer.decoder = decoders.ByteLevel()

        # Hack to combine ByteLevel decoding with a TemplateProcessing-style post-processor
        tokenizer.post_processor = processors.RobertaProcessing(
            sep=(self.original_tokenizer.eos_token,
                 self.original_tokenizer.eos_token_id),
            cls=(self.original_tokenizer.bos_token,
                 self.original_tokenizer.bos_token_id),
            add_prefix_space=False,
            trim_offsets=False,
        )
        return tokenizer
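
The Split pre-tokenizer above keeps only the spans matched by the CLIP-style regex; with behavior="removed" and invert=True, everything between matches (i.e. whitespace) is dropped. A standalone sketch:

from tokenizers import Regex, pre_tokenizers

pattern = Regex(r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""")
pt = pre_tokenizers.Split(pattern, behavior="removed", invert=True)
# Contractions are split off, each digit is isolated, whitespace disappears:
# e.g. [('it', ...), ("'s", ...), ('2', ...), ('tokenizers', ...)]
print(pt.pre_tokenize_str("it's 2 tokenizers"))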
    def converted(self) -> Tokenizer:
        tokenizer = self.tokenizer(self.proto)

        # Tokenizer assemble
        tokenizer.normalizer = self.normalizer(self.proto)

        replacement = "▁"
        add_prefix_space = True
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement=replacement,
                                     add_prefix_space=add_prefix_space),
        ])
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space)
        post_processor = self.post_processor()
        if post_processor:
            tokenizer.post_processor = post_processor

        return tokenizer
Example #10
def train_custom_tokenizer(dataset,
                           token_model,
                           tknzr_file,
                           vocab_size,
                           vocab=None,
                           pretrain_fast=False,
                           max_input_chars_per_word=None,
                           eos_token=None,
                           bos_token=None,
                           pad_token=None,
                           mask_token=None,
                           unk_token=None):
    """
    Building a Tokenizer using HuggingFace library. The pipeline seems to be:

        - Model           : algorithm that tokenizes, it is a mandatory
                            component. There are only 4 models implemented
                            (BPE, Unigram, WordLevel, WordPiece)
        - Normalizer      : some preprocessing that could happen before, but
                            doesn't necessarily have to
        - Pre-Tokenizer   : splitting the input according to some rules
        - Post-Processing : needing to add some tokens/input after (mostly seems
                            to be eos, bos tokens)
        - Decoder         : certain previous pipeline steps need to be reversed
                            for proper decoding
        - Trainer         : The corresponding training algorithm for the model

    Note : Some pre-processing might need to happen beforehand in previous
            functions (might be easier using pandas before)

    Input
        token_model (str)        : algorithm to use for tokenization
        dataset (class)          : a python iterator that goes through the data
                                    to be used for training
        token_dir (str)          : directory with tokenizers
        vocab_size (int)         : size of the vocabulary to use
        tokenFilename (str)     : filename of particular token we want to
                                    train. Will overwrite previously save files.
        vocab (list of str)      : models other than BPE can use non-mandatory
                                    vocab as input
        max_input_chars_per_word : used for WordPiece

    Output
        tokenizer                : huggingFace Tokenizer object, our fully
                                    trainer tokenizer

    """
    special_token_lst = [
        pad_token, bos_token, eos_token, mask_token, unk_token
    ]

    # Normalizers (e.g. NFKC) could be appended here; none are applied by default
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token,
                          vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        error_msg = f'Error: token_model ({token_model}) not an algorithm in {VALID_TOKENIZATIONS}'
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size,
                          show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence(
        [fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        #  pair=bos_token + " $A " + eos_token + " $B:1 " + eos_token + ":1",
    )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in range(len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print(f"Warning: overwriting previously saved tokenizer with "
              f"the same filename ({tknzr_file}).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        # The slow PreTrainedTokenizer base class cannot load a tokenizers JSON
        # file, so the fast wrapper is needed in this branch as well.
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer
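
A hypothetical call to the helper above with a toy in-memory dataset (all special-token strings and the output filename are placeholders):

toy_dataset = ["hello world", "tokenizers are fun"]
tknzr = train_custom_tokenizer(
    dataset=toy_dataset,
    token_model='BPE',
    tknzr_file='toy_tokenizer.json',
    vocab_size=100,
    pretrain_fast=True,
    pad_token='<pad>',
    bos_token='<s>',
    eos_token='</s>',
    mask_token='<mask>',
    unk_token='<unk>',
)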
def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = pre_tokenizers.Sequence(
        [pre_tokenizers.Whitespace(),
         pre_tokenizers.Punctuation()])

    tok_text = []
    for item in tokenizer.pre_tokenize_str(orig_text):
        tok_text.append(item[0])

    tok_text = " ".join(tok_text)

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
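
For intuition, the Whitespace + Punctuation sequence used above re-tokenizes orig_text roughly like this (a sketch; offsets omitted):

from tokenizers import pre_tokenizers

pt = pre_tokenizers.Sequence([pre_tokenizers.Whitespace(), pre_tokenizers.Punctuation()])
# "Steve Smith's" -> ['Steve', 'Smith', "'", 's']
print([piece for piece, _ in pt.pre_tokenize_str("Steve Smith's")])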