# Imports assumed by the snippets below (the original modules may organize
# these differently); `MeCabPreTokenizer`, `SentencePieceUnigramTokenizer`,
# and `PRETRAINED_TOKENIZER_FILE` are expected to be defined elsewhere.
from typing import Dict, Optional, Union
import json
import logging
import os

from tokenizers import AddedToken, Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.implementations.base_tokenizer import BaseTokenizer
from tokenizers.models import Unigram, WordLevel
from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str


def pre_tokenizer(self, replacement, add_prefix_space):
    return pre_tokenizers.Sequence(
        [
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
        ]
    )
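# Hedged sketch of what the WhitespaceSplit + Metaspace sequence above does.
# This is a standalone demo, not part of the original class; the `_demo_*`
# name is illustrative.
def _demo_pre_tokenizer():
    demo = pre_tokenizers.Sequence(
        [
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True),
        ]
    )
    # Splits on whitespace first, then re-marks each word with the "▁"
    # meta-symbol, roughly: [("▁Hello", ...), ("▁world", ...)]
    print(demo.pre_tokenize_str("Hello world"))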
def __init__(
    self,
    vocab: Optional[str] = None,
    replacement: str = "▁",
    add_prefix_space: bool = True,
):
    if vocab is not None:
        # Let Unigram(..) fail if the vocab file is invalid
        tokenizer = Tokenizer(Unigram(vocab))
    else:
        tokenizer = Tokenizer(Unigram())

    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Nmt(),
            normalizers.NFKC(),
        ]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
        ]
    )
    tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceUnigram",
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
    }

    super().__init__(tokenizer, parameters)
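# Hedged usage sketch for the constructor above, assuming the surrounding
# class is `SentencePieceUnigramTokenizer(BaseTokenizer)` as the "model"
# parameter suggests; the vocab path is illustrative.
def _demo_unigram_roundtrip():
    tokenizer = SentencePieceUnigramTokenizer(vocab="unigram_vocab.json")
    encoding = tokenizer.encode("Hello world")
    print(encoding.tokens)  # e.g. ["▁Hello", "▁world"], vocab permitting
    print(tokenizer.decode(encoding.ids))  # the Metaspace decoder restores spaces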
@staticmethod
def from_spm(filename: str):
    try:
        import sys

        # sentencepiece_model_pb2 is expected next to the script, hence the
        # current directory on sys.path.
        sys.path.append(".")

        import sentencepiece_model_pb2 as model
    except Exception:
        raise Exception(
            "You don't seem to have the required protobuf file. To use this function, run "
            "`pip install protobuf` and "
            "`wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_model_pb2.py` "
            "so we can read the internals of your spm file. `pip install sentencepiece` is not required."
        )

    m = model.ModelProto()
    with open(filename, "rb") as spm_file:
        m.ParseFromString(spm_file.read())

    precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
    vocab = [(piece.piece, piece.score) for piece in m.pieces]
    unk_id = m.trainer_spec.unk_id
    model_type = m.trainer_spec.model_type
    if model_type != 1:
        raise Exception(
            "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
        )

    data = {"unk_id": unk_id, "vocab": vocab}

    replacement = "▁"
    add_prefix_space = True

    # Write the vocab to a temporary JSON file so Unigram(..) can load it,
    # then clean up regardless of success.
    out_vocab_filename = f"{filename}.json"
    try:
        with open(out_vocab_filename, "w") as f:
            json.dump(data, f, indent=4)

        tokenizer = Tokenizer(Unigram(out_vocab_filename))
    finally:
        os.remove(out_vocab_filename)

    tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
        ]
    )
    tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceUnigram",
    }

    obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
    BaseTokenizer.__init__(obj, tokenizer, parameters)
    return obj
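# Hedged usage sketch for `from_spm` above: converting a trained SentencePiece
# unigram model into this tokenizer. The model path is illustrative, and
# sentencepiece_model_pb2.py must be available as described in the error
# message above.
def _demo_from_spm():
    tokenizer = SentencePieceUnigramTokenizer.from_spm("m.model")
    print(tokenizer.encode("Hello world").tokens)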
def __init__(
    self,
    vocab_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "[UNK]",
    pad_token: Union[str, AddedToken] = "[PAD]",
    mask_token: Union[str, AddedToken] = "[MASK]",
    lowercase: bool = False,
    unicode_normalizer: Optional[str] = None,
):
    if vocab_file is not None:
        logging.info(f"Initializing tokenizer from {vocab_file}")
        tokenizer = Tokenizer(WordLevel(vocab=vocab_file, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(WordLevel(unk_token=unk_token))

    # Let the tokenizer know about special tokens if they are part of the vocab
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:
        normalizers += [Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
        "unicode_normalizer": unicode_normalizer,
    }

    super().__init__(tokenizer, parameters)
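# Hedged usage sketch for the WordLevel constructor above, assuming the
# surrounding class is a BaseTokenizer subclass (`WordLevelTokenizer` is an
# illustrative name) and that vocab.json maps tokens to ids.
def _demo_word_level():
    tokenizer = WordLevelTokenizer(
        vocab_file="vocab.json",
        lowercase=True,
        unicode_normalizer="nfkc",
    )
    # Out-of-vocabulary words map to "[UNK]" via the WordLevel unk_token.
    print(tokenizer.encode("hello some-oov-word").tokens)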
def converted(self) -> Tokenizer:
    tokenizer = self.tokenizer(self.proto)

    # Assemble the tokenizer
    tokenizer.normalizer = self.normalizer(self.proto)

    replacement = "▁"
    add_prefix_space = True
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
        ]
    )
    tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)

    post_processor = self.post_processor()
    if post_processor:
        tokenizer.post_processor = post_processor

    return tokenizer
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    unk_token: Union[str, AddedToken] = "[UNK]",
    sep_token: Union[str, AddedToken] = "[SEP]",
    cls_token: Union[str, AddedToken] = "[CLS]",
    pad_token: Union[str, AddedToken] = "[PAD]",
    mask_token: Union[str, AddedToken] = "[MASK]",
    num_unused_tokens: int = 10,
    mecab_dic_type: str = "unidic_lite",
    wordpieces_prefix: str = "##",
) -> None:
    super().__init__(
        vocab=vocab,
        unk_token=unk_token,
        sep_token=sep_token,
        cls_token=cls_token,
        pad_token=pad_token,
        mask_token=mask_token,
        wordpieces_prefix=wordpieces_prefix,
    )

    self._tokenizer.add_special_tokens(["<unused{}>".format(i) for i in range(num_unused_tokens)])
    self._tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC(), normalizers.Strip()])

    if mecab_dic_type in ("unidic_lite", "unidic", "ipadic"):
        self._tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(MeCabPreTokenizer(mecab_dic_type))
    elif mecab_dic_type == "whitespace":
        self._tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
    else:
        raise ValueError(f"Invalid mecab_dic_type: {mecab_dic_type}")

    parameters = {
        "model": "BertWordPieceJapaneseTokenizer",
        "mecab_dic_type": mecab_dic_type,
    }
    self._parameters.update(parameters)
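# Hedged usage sketch for the Japanese WordPiece tokenizer above. The class
# name matches the "model" parameter; MeCab morphological analysis happens in
# the custom pre-tokenizer when mecab_dic_type names a dictionary.
def _demo_japanese_wordpiece():
    tokenizer = BertWordPieceJapaneseTokenizer(
        vocab="vocab.txt",
        mecab_dic_type="unidic_lite",  # requires the unidic_lite package
    )
    print(tokenizer.encode("日本語のテキストです。").tokens)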
def __init__(
    self,
    vocab_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    sep_token: Union[str, AddedToken] = "<sep>",
    cls_token: Union[str, AddedToken] = "<cls>",
    pad_token: Union[str, AddedToken] = "<pad>",
    mask_token: Union[str, AddedToken] = "<mask>",
    lowercase: bool = False,
    unicode_normalizer: Optional[str] = None,
):
    if vocab_file is not None:
        tokenizer = Tokenizer(WordLevel(vocab_file))
    else:
        tokenizer = Tokenizer(WordLevel())

    # Let the tokenizer know about special tokens if they are part of the vocab
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:
        normalizers += [Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    if vocab_file is not None:
        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = processors.BertProcessing(
            (str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
        )

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
        "unicode_normalizer": unicode_normalizer,
    }

    super().__init__(tokenizer, parameters)
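# Hedged usage sketch for the constructor above: with a vocab file, the
# BertProcessing post-processor wraps every encoding in <cls> ... <sep>.
# `WordLevelBertTokenizer` is an illustrative name for the surrounding class.
def _demo_word_level_bert():
    tokenizer = WordLevelBertTokenizer(vocab_file="vocab.json")
    encoding = tokenizer.encode("hello world")
    # First and last tokens come from the post-processor: ["<cls>", ..., "<sep>"]
    print(encoding.tokens)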
def __init__(
    self,
    target_vocab,
):
    special_tokens = {
        "pad_token": "[PAD]",
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]",
    }

    vocab = {}
    vocab[special_tokens["pad_token"]] = 0

    tkn_idx = 1
    unused_ctr = 0

    # Not sure whether that's relevant, but fill ids 1..99 and 104..999 with
    # unused tokens to keep BERT's tokenizer style. As a result, one can
    # easily identify special tokens:
    #   0 is padding
    #   1xx are the other special tokens
    #   any four-digit id is actual payload
    fill_tokens = False
    if fill_tokens:
        while tkn_idx < 100:
            vocab[f"[unused{unused_ctr}]"] = tkn_idx
            tkn_idx += 1
            unused_ctr += 1

    for token in ["unk_token", "cls_token", "sep_token", "mask_token"]:
        vocab[special_tokens[token]] = tkn_idx
        tkn_idx += 1

    if fill_tokens:
        while tkn_idx < 1000:
            vocab[f"[unused{unused_ctr}]"] = tkn_idx
            tkn_idx += 1
            unused_ctr += 1

    for word in target_vocab:
        vocab[word] = tkn_idx
        tkn_idx += 1

    tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token=special_tokens["unk_token"]))
    tokenizer.add_special_tokens(list(special_tokens.values()))
    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    sep_token_id = tokenizer.token_to_id(special_tokens["sep_token"])
    cls_token_id = tokenizer.token_to_id(special_tokens["cls_token"])
    tokenizer.post_processor = processors.BertProcessing(
        (special_tokens["sep_token"], sep_token_id),
        (special_tokens["cls_token"], cls_token_id),
    )

    # Copy before adding the "model" key so special_tokens itself stays intact.
    parameters = dict(special_tokens)
    parameters["model"] = "WordLevel"

    super().__init__(tokenizer, parameters)
    tokenizer.save(PRETRAINED_TOKENIZER_FILE)
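# Hedged usage sketch for the fixed-vocabulary tokenizer above, assuming the
# surrounding class (`TargetVocabTokenizer`, an illustrative name) is a
# BaseTokenizer subclass and PRETRAINED_TOKENIZER_FILE points somewhere
# writable.
def _demo_target_vocab():
    tokenizer = TargetVocabTokenizer(target_vocab=["foo", "bar", "baz"])
    # With fill_tokens=False, as hard-coded above, the ids are:
    # [PAD]=0, [UNK]=1, [CLS]=2, [SEP]=3, [MASK]=4, then the payload words.
    print(tokenizer.encode("foo baz qux").tokens)  # "qux" -> "[UNK]"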