Example #1
    def pre_tokenizer(self, replacement, add_prefix_space):
        return pre_tokenizers.Sequence(
            [
                pre_tokenizers.WhitespaceSplit(),
                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
            ]
        )
    def __init__(
        self,
        vocab: Optional[str] = None,
        replacement: str = "▁",
        add_prefix_space: bool = True,
    ):
        if vocab is not None:
            # Let Unigram(..) fail if only one of them is None
            tokenizer = Tokenizer(Unigram(vocab))
        else:
            tokenizer = Tokenizer(Unigram())

        tokenizer.normalizer = normalizers.Sequence([
            normalizers.Nmt(),
            normalizers.NFKC(),
        ])
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement=replacement,
                                     add_prefix_space=add_prefix_space),
        ])
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space)

        parameters = {
            "model": "SentencePieceUnigram",
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
    @staticmethod
    def from_spm(filename: str):
        try:
            import sys

            sys.path.append(".")

            import sentencepiece_model_pb2 as model
        except Exception:
            raise Exception(
                "You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
            )

        m = model.ModelProto()
        with open(filename, "rb") as spm_file:
            m.ParseFromString(spm_file.read())

        precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
        vocab = [(piece.piece, piece.score) for piece in m.pieces]
        unk_id = m.trainer_spec.unk_id
        model_type = m.trainer_spec.model_type
        if model_type != 1:
            raise Exception(
                "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
            )

        data = {"unk_id": unk_id, "vocab": vocab}

        replacement = "▁"
        add_prefix_space = True

        out_vocab_filename = f"{filename}.json"
        try:
            with open(out_vocab_filename, "w") as f:
                json.dump(data, f, indent=4)

            tokenizer = Tokenizer(Unigram(out_vocab_filename))
        finally:
            os.remove(out_vocab_filename)

        tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.WhitespaceSplit(),
                pre_tokenizers.Metaspace(
                    replacement=replacement, add_prefix_space=add_prefix_space
                ),
            ]
        )
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space
        )

        parameters = {
            "model": "SentencePieceUnigram",
        }

        # Wrap the already-configured tokenizer without re-running
        # SentencePieceUnigramTokenizer.__init__, which would rebuild the
        # normalizer/pre-tokenizer pipeline and discard the Precompiled charsmap.
        obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
        BaseTokenizer.__init__(obj, tokenizer, parameters)
        return obj
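
A minimal usage sketch for the class above, assuming it is exported as tokenizers.SentencePieceUnigramTokenizer and that a recent release providing train_from_iterator is installed; the toy corpus, vocab size, and the .spm path in the comment are illustrative placeholders, not values from the snippet:

from tokenizers import SentencePieceUnigramTokenizer

# Train a fresh Unigram model from an in-memory toy corpus ...
tokenizer = SentencePieceUnigramTokenizer()
tokenizer.train_from_iterator(
    ["hello world", "hello tokenizers"],
    vocab_size=30,
    special_tokens=["<unk>"],
)

# ... or import an existing SentencePiece model via the method above
# (requires the protobuf setup described in its error message):
# tokenizer = SentencePieceUnigramTokenizer.from_spm("spiece.model")  # hypothetical path

encoding = tokenizer.encode("hello world")
print(encoding.tokens)  # SentencePiece-style pieces; word-initial pieces carry the '▁' marker
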
Example #4
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
    ):
        if vocab_file is not None:
            logging.info(f"Initializing tokenizer from {vocab_file}")
            tokenizer = Tokenizer(
                WordLevel(vocab=vocab_file, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordLevel(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

        parameters = {
            "model": "WordLevel",
            "unk_token": unk_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
        }

        super().__init__(tokenizer, parameters)
    def converted(self) -> Tokenizer:
        tokenizer = self.tokenizer(self.proto)

        # Tokenizer assemble
        tokenizer.normalizer = self.normalizer(self.proto)

        replacement = "▁"
        add_prefix_space = True
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement=replacement,
                                     add_prefix_space=add_prefix_space),
        ])
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space)
        post_processor = self.post_processor()
        if post_processor:
            tokenizer.post_processor = post_processor

        return tokenizer
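
The Metaspace pre-tokenizer/decoder pair wired up in these constructors can also be exercised on its own. A small sketch of what it does to raw text, assuming a tokenizers release that still accepts add_prefix_space as the snippets above do (newer releases replace it with prepend_scheme):

from tokenizers import decoders, pre_tokenizers

pre = pre_tokenizers.Sequence([
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True),
])
# Each whitespace-separated word becomes a piece carrying the '▁' marker.
print(pre.pre_tokenize_str("Hello world"))   # pieces '▁Hello' and '▁world' with their offsets

dec = decoders.Metaspace(replacement="▁", add_prefix_space=True)
# The decoder reverses the mapping: '▁' becomes a space and the leading space is dropped.
print(dec.decode(["▁Hello", "▁world"]))      # "Hello world"
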
Example #6
    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        num_unused_tokens: int = 10,
        mecab_dic_type: str = "unidic_lite",
        wordpieces_prefix: str = "##",
    ) -> None:
        super().__init__(
            vocab=vocab,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            wordpieces_prefix=wordpieces_prefix,
        )
        self._tokenizer.add_special_tokens(
            ['<unused{}>'.format(i) for i in range(num_unused_tokens)])

        self._tokenizer.normalizer = normalizers.Sequence(
            [normalizers.NFKC(), normalizers.Strip()])
        if mecab_dic_type in ("unidic_lite", "unidic", "ipadic"):
            self._tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
                MeCabPreTokenizer(mecab_dic_type))
        elif mecab_dic_type == "whitespace":
            self._tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            raise ValueError("Invalid pre_tokenizer_type is specified.")

        parameters = {
            "model": "BertWordPieceJapaneseTokenizer",
            "mecab_dic_type": mecab_dic_type,
        }
        self._parameters.update(parameters)
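
MeCabPreTokenizer is not shown in this snippet, but the PreTokenizer.custom hook it is plugged into accepts any Python object with a pre_tokenize(self, pretok) method. A minimal sketch with a toy character-level splitter standing in for the MeCab-based one; the class name and vocab below are illustrative only:

from typing import List

from tokenizers import NormalizedString, PreTokenizedString, Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import PreTokenizer


class CharacterPreTokenizer:
    # Toy stand-in for MeCabPreTokenizer: every character becomes its own piece.
    def character_split(self, i: int, normalized: NormalizedString) -> List[NormalizedString]:
        return [normalized[j:j + 1] for j in range(len(str(normalized)))]

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.character_split)


tokenizer = Tokenizer(WordPiece(vocab={"[UNK]": 0, "日": 1, "本": 2}, unk_token="[UNK]"))
tokenizer.pre_tokenizer = PreTokenizer.custom(CharacterPreTokenizer())
print(tokenizer.encode("日本").tokens)  # ['日', '本']

One caveat: a tokenizer carrying a custom Python pre-tokenizer cannot be serialized with tokenizer.save(), so the custom component has to be re-attached after loading.
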
Example #7
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        sep_token: Union[str, AddedToken] = "<sep>",
        cls_token: Union[str, AddedToken] = "<cls>",
        pad_token: Union[str, AddedToken] = "<pad>",
        mask_token: Union[str, AddedToken] = "<mask>",
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
    ):
        if vocab_file is not None:
            tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordLevel(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

        if vocab_file is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = processors.BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

        parameters = {
            "model": "WordLevel",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
        }

        super().__init__(tokenizer, parameters)
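
What the BertProcessing post-processor configured above does to single sequences and pairs, as a self-contained sketch with a toy in-memory vocab (the vocab and sentences are illustrative; a recent tokenizers release is assumed for the dict-based WordLevel constructor):

from tokenizers import Tokenizer, pre_tokenizers, processors
from tokenizers.models import WordLevel

vocab = {"<pad>": 0, "<unk>": 1, "<cls>": 2, "<sep>": 3, "<mask>": 4, "hello": 5, "world": 6}
tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
tokenizer.post_processor = processors.BertProcessing(
    ("<sep>", vocab["<sep>"]), ("<cls>", vocab["<cls>"])
)

# Single sequences are wrapped as <cls> ... <sep>; pairs as <cls> A <sep> B <sep>.
encoding = tokenizer.encode("hello world", "world")
print(encoding.tokens)    # ['<cls>', 'hello', 'world', '<sep>', 'world', '<sep>']
print(encoding.type_ids)  # [0, 0, 0, 0, 1, 1]
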
    def __init__(
        self,
        target_vocab,
    ):
        special_tokens = {
            "pad_token": "[PAD]",
            "unk_token": "[UNK]",
            "sep_token": "[SEP]",
            "cls_token": "[CLS]",
            "mask_token": "[MASK]",
        }

        vocab = {}
        vocab[special_tokens["pad_token"]] = 0

        tkn_idx = 1
        unused_ctr = 0

        # not sure whether that's relevant, but fill 1..99 and 104..999
        # with unused tokens to keep BERT's tokenizer style
        # as a result, one can easily identify special tokens:
        # 0 is padding
        # 1xx are other special tokens
        # any four-digit tokens are actual payload
        fill_tokens = False

        if fill_tokens:
            while tkn_idx < 100:
                vocab[f"[unused{unused_ctr}]"] = tkn_idx
                tkn_idx += 1
                unused_ctr += 1

        for token in ["unk_token", "cls_token", "sep_token", "mask_token"]:
            vocab[special_tokens[token]] = tkn_idx
            tkn_idx += 1

        if fill_tokens:
            while tkn_idx < 1000:
                vocab[f"[unused{unused_ctr}]"] = tkn_idx
                tkn_idx += 1
                unused_ctr += 1

        for word in target_vocab:
            vocab[word] = tkn_idx
            tkn_idx += 1

        tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token=special_tokens["unk_token"]))
        tokenizer.add_special_tokens(list(special_tokens.values()))
        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

        sep_token_id = tokenizer.token_to_id(special_tokens["sep_token"])
        cls_token_id = tokenizer.token_to_id(special_tokens["cls_token"])

        tokenizer.post_processor = processors.BertProcessing(
            (special_tokens["sep_token"], sep_token_id), (special_tokens["cls_token"], cls_token_id)
        )

        parameters = special_tokens
        parameters["model"] = "WordLevel"

        super().__init__(tokenizer, parameters)

        tokenizer.save(PRETRAINED_TOKENIZER_FILE)
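
Since the constructor ends by saving the assembled tokenizer to disk, it can later be restored without the wrapper class via Tokenizer.from_file. A round-trip sketch with a toy vocab and a temporary path standing in for PRETRAINED_TOKENIZER_FILE (which is defined elsewhere in the original project):

import os
import tempfile

from tokenizers import Tokenizer, pre_tokenizers
from tokenizers.models import WordLevel

vocab = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4, "abc": 5}
tok = Tokenizer(WordLevel(vocab=vocab, unk_token="[UNK]"))
tok.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

path = os.path.join(tempfile.mkdtemp(), "tokenizer.json")
tok.save(path)                        # same call as tokenizer.save(PRETRAINED_TOKENIZER_FILE) above
reloaded = Tokenizer.from_file(path)  # restores the model and pre-tokenizer from the JSON file
print(reloaded.encode("abc xyz").tokens)  # ['abc', '[UNK]']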