def converted(self) -> Tokenizer:
    """Assemble a ``Tokenizer`` from the wrapped slow tokenizer.

    Reads ``encoder``, ``bpe_ranks``, ``unk_token`` and the BOS/EOS tokens
    (together with their ids) from ``self.original_tokenizer`` and wires up a
    BPE model with its normalizer, pre-tokenizer, decoder and post-processor.

    Returns:
        The fully configured ``Tokenizer``.
    """
    slow = self.original_tokenizer

    bpe_model = BPE(
        vocab=slow.encoder,
        merges=list(slow.bpe_ranks.keys()),
        dropout=None,
        continuing_subword_prefix="",
        end_of_word_suffix="</w>",
        fuse_unk=False,
        unk_token=str(slow.unk_token),
    )
    fast = Tokenizer(bpe_model)

    # Text clean-up: NFC, collapse every whitespace run to one space, lowercase.
    normalization_steps = [
        normalizers.NFC(),
        normalizers.Replace(Regex(r"\s+"), " "),
        normalizers.Lowercase(),
    ]
    fast.normalizer = normalizers.Sequence(normalization_steps)

    # With ``invert=True`` the regex describes the pieces to keep; whatever
    # lies between matches is dropped (``behavior="removed"``).
    word_splitter = pre_tokenizers.Split(
        Regex(r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""),
        behavior="removed",
        invert=True,
    )
    fast.pre_tokenizer = pre_tokenizers.Sequence(
        [word_splitter, pre_tokenizers.ByteLevel(add_prefix_space=False)]
    )
    fast.decoder = decoders.ByteLevel()

    # Workaround: RobertaProcessing stands in for a combination of ByteLevel
    # and TemplateProcessing post-processing.
    eos_pair = (slow.eos_token, slow.eos_token_id)
    bos_pair = (slow.bos_token, slow.bos_token_id)
    fast.post_processor = processors.RobertaProcessing(
        sep=eos_pair,
        cls=bos_pair,
        add_prefix_space=False,
        trim_offsets=False,
    )
    return fast
def normalizer(self, proto):
    """Build the normalizer pipeline for the converted tokenizer.

    Args:
        proto: Object exposing ``normalizer_spec.precompiled_charsmap``
            (presumably the parsed SentencePiece model proto).

    Returns:
        A ``Sequence`` that, in order: rewrites `` and '' quote pairs to ``"``,
        optionally strips accents (when ``keep_accents`` is false), optionally
        lowercases (when ``do_lower_case`` is true), applies the precompiled
        character map when one is present, and collapses runs of two or more
        spaces into a single space.
    """
    steps = [Replace("``", '"'), Replace("''", '"')]

    if not self.original_tokenizer.keep_accents:
        steps.extend([NFKD(), StripAccents()])
    if self.original_tokenizer.do_lower_case:
        steps.append(Lowercase())

    precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
    # ``Precompiled`` cannot be built from an empty charsmap, so the step is
    # skipped for models that do not ship one.
    if precompiled_charsmap:
        steps.append(Precompiled(precompiled_charsmap))

    steps.append(Replace(Regex(" {2,}"), " "))
    return Sequence(steps)
def normalizer(self, proto):
    """Return the normalizer sequence for the converted tokenizer.

    Args:
        proto: Object exposing ``normalizer_spec.precompiled_charsmap``.

    Returns:
        A ``normalizers.Sequence`` made of, in order: ``Lowercase`` (only when
        ``self.original_tokenizer.do_lower_case`` is truthy), ``Strip``,
        ``Precompiled`` (only when the charsmap is non-empty) and a ``Replace``
        that collapses runs of two or more spaces into one.
    """
    lowercase_step = (
        [normalizers.Lowercase()] if self.original_tokenizer.do_lower_case else []
    )

    charsmap = proto.normalizer_spec.precompiled_charsmap
    charsmap_step = [normalizers.Precompiled(charsmap)] if charsmap else []

    return normalizers.Sequence(
        [
            *lowercase_step,
            normalizers.Strip(),
            *charsmap_step,
            normalizers.Replace(Regex(" {2,}"), " "),
        ]
    )
def __init__(
    self,
    replacement: str = "▁",
    add_prefix_space: bool = True,
    unk_token: Union[str, AddedToken] = "<unk>",
    eos_token: Union[str, AddedToken] = "</s>",
    pad_token: Union[str, AddedToken] = "<pad>",
):
    """Create an untrained Unigram tokenizer with its processing pipeline.

    Args:
        replacement: Character handed to the Metaspace pre-tokenizer and
            decoder.
        add_prefix_space: Forwarded to the Metaspace pre-tokenizer and decoder.
        unk_token: Unknown token, registered with id 2.
        eos_token: End-of-sequence token, registered with id 1 and appended to
            every single sequence by the post-processor template.
        pad_token: Padding token, registered with id 0.
    """
    self.special_tokens = {
        "pad": {"id": 0, "token": pad_token},
        "eos": {"id": 1, "token": eos_token},
        "unk": {"id": 2, "token": unk_token},
    }
    # Flat list of the special tokens where each token sits at its own id.
    by_id = sorted(self.special_tokens.values(), key=lambda entry: entry["id"])
    self.special_tokens_list = [entry["token"] for entry in by_id]

    unigram_tokenizer = Tokenizer(Unigram())

    unigram_tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Nmt(),
            normalizers.NFKC(),
            normalizers.Replace(Regex(" {2,}"), " "),
            normalizers.Lowercase(),
        ]
    )

    metaspace_splitter = pre_tokenizers.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )
    unigram_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            metaspace_splitter,
            pre_tokenizers.Digits(individual_digits=True),
            pre_tokenizers.Punctuation(),
        ]
    )
    unigram_tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )

    # Template: the sequence itself followed by the EOS token.
    eos_entry = self.special_tokens["eos"]
    unigram_tokenizer.post_processor = TemplateProcessing(
        single=f"$A {eos_entry['token']}",
        special_tokens=[(eos_entry["token"], eos_entry["id"])],
    )

    init_parameters = dict(
        model="SentencePieceUnigram",
        replacement=replacement,
        add_prefix_space=add_prefix_space,
    )
    super().__init__(unigram_tokenizer, init_parameters)
def from_spm(filename: str):
    """Build a ``SentencePieceUnigramTokenizer`` from a SentencePiece model file.

    The file is parsed with the ``sentencepiece_model_pb2`` protobuf module,
    which must be importable (the current directory is added to ``sys.path``
    so a locally downloaded copy is found).

    Args:
        filename: Path to the serialized SentencePiece model.

    Returns:
        A ``SentencePieceUnigramTokenizer`` wrapping a Unigram model with the
        vocabulary and unknown-token id read from the file. The instance is
        created without running ``SentencePieceUnigramTokenizer.__init__``.

    Raises:
        Exception: If ``sentencepiece_model_pb2`` cannot be imported (the
            original error is attached as the cause), or if the model was not
            trained with the Unigram algorithm (``model_type != 1``).
    """
    import sys

    # Avoid growing sys.path by one duplicate entry per call.
    if "." not in sys.path:
        sys.path.append(".")

    try:
        import sentencepiece_model_pb2 as model
    except Exception as err:
        raise Exception(
            "You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
        ) from err

    m = model.ModelProto()
    # Close the file handle deterministically instead of leaking it.
    with open(filename, "rb") as model_file:
        m.ParseFromString(model_file.read())

    precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
    vocab = [(piece.piece, piece.score) for piece in m.pieces]
    unk_id = m.trainer_spec.unk_id
    model_type = m.trainer_spec.model_type
    if model_type != 1:
        raise Exception(
            "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
        )

    replacement = "▁"
    add_prefix_space = True

    tokenizer = Tokenizer(Unigram(vocab, unk_id))

    # ``Precompiled`` cannot be built from an empty charsmap, so only include
    # it when the model actually ships one.
    normalizer_steps = [normalizers.Replace(Regex(" {2,}"), " ")]
    if precompiled_charsmap:
        normalizer_steps.insert(0, normalizers.Precompiled(precompiled_charsmap))
    tokenizer.normalizer = normalizers.Sequence(normalizer_steps)

    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )

    parameters = {
        "model": "SentencePieceUnigram",
    }

    # Bypass SentencePieceUnigramTokenizer.__init__ (which would build a fresh,
    # untrained tokenizer) and initialise only the base class.
    obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
    BaseTokenizer.__init__(obj, tokenizer, parameters)
    return obj
def normalizer(self, proto):
    """Build the normalizer pipeline for the converted tokenizer.

    Args:
        proto: Object exposing ``normalizer_spec.precompiled_charsmap``
            (presumably the parsed SentencePiece model proto).

    Returns:
        A ``normalizers.Sequence`` that, in order: rewrites `` and '' quote
        pairs to ``"``, optionally applies NFKD + accent stripping (when
        ``keep_accents`` is false), optionally lowercases (when
        ``do_lower_case`` is true), applies the precompiled character map when
        one is present, and collapses runs of two or more spaces into one.
    """
    list_normalizers = [
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
    ]

    if not self.original_tokenizer.keep_accents:
        list_normalizers.extend([normalizers.NFKD(), normalizers.StripAccents()])
    if self.original_tokenizer.do_lower_case:
        list_normalizers.append(normalizers.Lowercase())

    precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
    # ``Precompiled`` cannot be built from an empty charsmap, so the step is
    # skipped for models that do not ship one.
    if precompiled_charsmap:
        list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))

    list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
    return normalizers.Sequence(list_normalizers)
def normalizer(self, proto):
    """Return the default normalizer sequence for the converted tokenizer.

    Args:
        proto: Object exposing ``normalizer_spec.precompiled_charsmap``
            (presumably the parsed SentencePiece model proto).

    Returns:
        A ``Sequence`` that applies the precompiled character map (when the
        model provides a non-empty one) and then collapses runs of two or
        more spaces into a single space.
    """
    precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap

    steps = [Replace(Regex(" {2,}"), " ")]
    # ``Precompiled`` cannot be built from an empty charsmap, so it is only
    # placed in front when there is something to apply.
    if precompiled_charsmap:
        steps.insert(0, Precompiled(precompiled_charsmap))
    return Sequence(steps)
from itertools import product
from tokenizers import Tokenizer, Regex
from tokenizers.models import WordLevel
from tokenizers.normalizers import NFD
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Split, WhitespaceSplit


## Generator which builds and trains one tokenizer per (token mode, event) pair
def stackTraceTokenizer(tokens: tuple, events: tuple, vocab_size=2_000, min_freq=3):
    """Train and yield a WordLevel tokenizer for each (token mode, event) pair.

    Every combination from ``product(tokens, events)`` gets a fresh tokenizer
    with an NFD normalizer. The mode ``'white'`` selects whitespace splitting;
    any other mode selects a regex split with ``behavior='isolated'``. Each
    tokenizer is trained on the file ``vocab-<event>.txt``. Progress is printed
    to stdout.

    Args:
        tokens: Pre-tokenization mode names.
        events: Event names used to locate the training files.
        vocab_size: Passed to ``WordLevelTrainer`` as ``vocab_size``.
        min_freq: Passed to ``WordLevelTrainer`` as ``min_frequency``.

    Yields:
        Tuples of ``(tokenizer, event, mode)``.
    """
    for split_mode, event_name in product(tokens, events):
        print(split_mode, event_name)

        word_tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        word_tokenizer.normalizer = NFD()
        word_tokenizer.pre_tokenizer = (
            WhitespaceSplit()
            if split_mode == 'white'
            else Split(
                pattern=Regex("[A-Z]+[a-z0-9]+|[.A-Z]+|[a-z0-9]+"),
                behavior='isolated',
            )
        )

        word_trainer = WordLevelTrainer(
            vocab_size=vocab_size,
            min_frequency=min_freq,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        )
        training_files = [f"vocab-{event_name}.txt"]
        word_tokenizer.train(training_files, word_trainer)

        print(f"Trained tokenizer for {split_mode}:{event_name}")
        yield word_tokenizer, event_name, split_mode