def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile) -> AuxiliaryFile:
    subset = self._create_subset_file(afm, corpus)

    # Create WordPiece model with a normalizer and pre-tokenizer. Note that
    # BERT-specific normalizer and pre-tokenizer are used in this model.
    tokenizer = Tokenizer(WordPiece())
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Train tokenizer model with subset of corpus.
    trainer = WordPieceTrainer(vocab_size=self.vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=self.limit_alphabet,
                               special_tokens=[self.unk_token] + self.special_tokens,
                               continuing_subword_prefix='##')
    tokenizer.train(trainer, [subset.name])

    # Save trained vocabulary to an auxiliary output file.
    vocab = afm.create()
    tokenizer.model.save(os.path.dirname(vocab.name))
    os.rename(os.path.join(os.path.dirname(vocab.name), 'vocab.txt'),
              vocab.name)

    return vocab
def train_custom_tokenizer(files: List[str], tokenizer_file: str, **kwargs) -> BertWordPieceTokenizer:
    """Train and save a tokenizer that uses a custom PreTokenizer."""
    tokenizer = BertWordPieceTokenizer(
        handle_chinese_chars=False,  # for Japanese
        strip_accents=False,  # for Japanese
    )
    tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())

    # Learn subword segmentation from the given corpus files.
    tokenizer.train(files, **kwargs)

    # Save the tokenizer configuration as JSON, including the vocabulary and
    # preprocessing parameters.
    # NOTE: a custom PreTokenizer written in Python cannot be serialized, so a
    # Rust-based PreTokenizer is injected as a dummy before serialization.
    # The JSON therefore records the dummy PreTokenizer, and the custom
    # PreTokenizer must be re-attached when loading.
    tokenizer._tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.save(tokenizer_file)

    # (Optional) a plain-text vocab file is saved as f"vocab-{filename}.txt"
    # (in case external tooling needs it).
    filename = "wordpiece"
    model_files = tokenizer._tokenizer.model.save(
        str(pathlib.Path(tokenizer_file).parent), filename)

    return tokenizer
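# Because the saved JSON only records the dummy BertPreTokenizer, the custom
# pre-tokenizer has to be re-attached after loading. A minimal loading sketch,
# assuming the same MecabPreTokenizer class used above is importable here:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import PreTokenizer


def load_custom_tokenizer(tokenizer_file: str) -> Tokenizer:
    """Load a serialized tokenizer and re-attach the custom PreTokenizer."""
    tokenizer = Tokenizer.from_file(tokenizer_file)
    # Replace the dummy Rust pre-tokenizer recorded in the JSON with the Python one.
    tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())
    return tokenizer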
def save_pretrained(self, dir, legacy_format=None, pre=None, push_to_hub=False, **kw):
    # The custom pre-tokenizer cannot be serialized, so swap in the stock
    # BertPreTokenizer before delegating to the parent implementation.
    self.backend_tokenizer.pre_tokenizer = BertPreTokenizer()
    return super().save_pretrained(dir, legacy_format, pre, push_to_hub, **kw)
def __init__(
    self,
    vocab_file: Optional[str] = None,
    add_special_tokens: bool = True,
    unk_token: str = "[UNK]",
    sep_token: str = "[SEP]",
    cls_token: str = "[CLS]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    if vocab_file is not None:
        tokenizer = Tokenizer(WordPiece.from_files(vocab_file, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(WordPiece.empty())

    tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if add_special_tokens and vocab_file is not None:
        sep_token_id = tokenizer.token_to_id(sep_token)
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(cls_token)
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = BertProcessing(
            (sep_token, sep_token_id), (cls_token, cls_token_id))
    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "add_special_tokens": add_special_tokens,
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
def save_pretrained(
    self,
    save_directory,
    legacy_format=None,
    filename_prefix=None,
    push_to_hub=False,
    **kwargs,
):
    self.backend_tokenizer.pre_tokenizer = BertPreTokenizer()
    return super().save_pretrained(save_directory, legacy_format,
                                   filename_prefix, push_to_hub, **kwargs)
def tokenize_corpus(input_file: str,
                    output_file: str,
                    vocab_file: str,
                    unk_token: str = '<unk>',
                    control_tokens: List[str] = []):
    r"""Tokenize corpus sentences through trained **WordPiece** model.

    Arguments:
        input_file (str): Input corpus file path.
        output_file (str): Output file path.
        vocab_file (str): Trained vocabulary file path.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.
    """
    # Create `WordPiece` model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(models.WordPiece(vocab_file, unk_token=unk_token))
    tokenizer.add_special_tokens([unk_token] + control_tokens)

    # Use BERT-specific normalizer, pre-tokenizer and **WordPiece** decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = decoders.WordPiece(prefix='##')

    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as dst:
        # Count total lines in corpus.
        total_lines = 0
        for _ in src:
            total_lines += 1

        # Rewind the corpus file to the beginning.
        src.seek(0)

        buffer = []
        for line in tqdm.tqdm(src,
                              desc='[*] tokenize corpus',
                              total=total_lines):
            buffer.append(line)

            # Tokenize buffered sentences and write to `output_file`.
            if len(buffer) > 10000:
                for t in tokenizer.encode_batch(buffer):
                    dst.write(' '.join(t.tokens) + '\n')
                buffer.clear()

        # Process the remaining buffer.
        if buffer:
            for t in tokenizer.encode_batch(buffer):
                dst.write(' '.join(t.tokens) + '\n')
def __init__(self, pretokenizer='moses'):
    self.tagger = PerceptronTagger()
    self.pretok_type = pretokenizer
    if pretokenizer == 'bertpretokenizer':
        self.pretokenizer = BertPreTokenizer()
    elif pretokenizer == 'moses':
        self.pretokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
    elif pretokenizer == 'whitespace':
        pass
    else:
        raise ValueError(
            "pretokenizer must be 'bertpretokenizer', 'moses', or 'whitespace'.")
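# For reference, BertPreTokenizer splits on whitespace and punctuation and reports
# character offsets. A minimal standalone sketch (sample sentence is illustrative;
# the exact return format may vary slightly between tokenizers releases):
from tokenizers.pre_tokenizers import BertPreTokenizer

pre_tokenizer = BertPreTokenizer()
print(pre_tokenizer.pre_tokenize_str("Hello, world!"))
# e.g. [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]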
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Optional[str] = "<unk>",
    suffix: Optional[str] = "</w>",
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(vocab_file,
                           merges_file,
                           dropout=dropout,
                           unk_token=unk_token,
                           end_of_word_suffix=suffix))
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else).
    normalizers = []
    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    # OpenAI normalization is the same as Bert.
    normalizers += [BertNormalizer()]

    # Create the normalizer structure.
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = BPEDecoder(suffix=suffix)

    parameters = {
        "model": "BPE",
        "unk_token": unk_token,
        "suffix": suffix,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
          vocab: AuxiliaryFile) -> AuxiliaryFile:
    total_lines = self._total_lines_in_file(corpus)

    # Create WordPiece model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
    tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

    # Use BERT-specific normalizer, pre-tokenizer and decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = WordPieceDecoder(prefix='##')

    tokenized = afm.create()
    with corpus.open('r') as src, tokenized.open('w') as dst:
        # Create tqdm progress bar with colorful description.
        tqdm_iter = tqdm.tqdm(src,
                              desc=colorful.render(
                                  '<r>[*]</r> tokenize sentences with '
                                  '<g>WordPiece</g> model'),
                              total=total_lines)

        batch_lines = []
        for line in tqdm_iter:
            batch_lines.append(line)

            # Encode the grouped batch sentences and write the tokenized
            # sentences to the auxiliary output file.
            if len(batch_lines) > self.batch_size:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')
                batch_lines.clear()

        # Encode the remainder and write to the output file.
        if batch_lines:
            for t in tokenizer.encode_batch(batch_lines):
                dst.write(' '.join(t.tokens) + '\n')

    return tokenized
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    unk_token: Union[str, AddedToken] = "[UNK]",
    sep_token: Union[str, AddedToken] = "[SEP]",
    cls_token: Union[str, AddedToken] = "[CLS]",
    pad_token: Union[str, AddedToken] = "[PAD]",
    mask_token: Union[str, AddedToken] = "[MASK]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: Optional[bool] = None,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    if vocab is not None:
        tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
    else:
        tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

    # Let the tokenizer know about special tokens if they are part of the vocab.
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if vocab is not None:
        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = BertProcessing(
            (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
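# A hypothetical round-trip with the BertWordPieceTokenizer constructed above; the
# "vocab.txt" path is illustrative and the token output depends on the vocabulary:
from tokenizers import BertWordPieceTokenizer

tok = BertWordPieceTokenizer("vocab.txt", lowercase=True)
enc = tok.encode("Hello, world!")
print(enc.tokens)          # e.g. ['[CLS]', 'hello', ',', 'world', '!', '[SEP]']
print(tok.decode(enc.ids)) # WordPiece decoder re-joins '##'-prefixed subwords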
def __getstate__(self):
    state = self.__dict__.copy()
    # A Rust tokenizer carrying a custom (Python) pre-tokenizer cannot be pickled,
    # so swap in the stock BertPreTokenizer before returning the state.
    state["_tokenizer"].pre_tokenizer = BertPreTokenizer()
    return state
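# A matching __setstate__ is typically needed so that unpickling restores the custom
# pre-tokenizer. A minimal sketch under the assumption that the class uses a
# MecabPreTokenizer like the one in train_custom_tokenizer above (hypothetical here):
def __setstate__(self, state):
    self.__dict__ = state
    # Re-attach the Python pre-tokenizer that was replaced in __getstate__.
    self._tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())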
def test_instantiate(self):
    assert BertPreTokenizer() is not None
    assert isinstance(BertPreTokenizer(), PreTokenizer)
def train_tokenizer(input_file: str,
                    vocab_file: str,
                    temporary: str,
                    subset_size: int = 512000000,
                    vocab_size: int = 8000,
                    limit_alphabet: int = 6000,
                    unk_token: str = '<unk>',
                    control_tokens: List[str] = []):
    r"""Train **WordPiece** tokenizer and save trained subword vocabulary.

    Note:
        Since tokenizers_ reads the whole file into memory during training, this
        function may run out of memory if `input_file` is too large. Under the
        assumption that `input_file` is shuffled randomly, a subset of the input
        corpus is used for training.

    Caution:
        The subset of the input corpus is saved in the `temporary` directory.
        Please be careful not to delete the file while executing this function.

    Arguments:
        input_file (str): Input file path.
        vocab_file (str): Output vocabulary file path.
        temporary (str): Temporary directory where the subset of the corpus is
            saved.
        subset_size (int): The maximum number of lines in the subset.
        vocab_size (int): The number of subwords in the vocabulary.
        limit_alphabet (int): The maximum number of alphabet characters in the
            vocabulary.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.

    .. _tokenizers: https://github.com/huggingface/tokenizers
    """
    # Create **WordPiece** model and add normalizer and pre-tokenizer.
    # BERT-specific normalizer and pre-tokenizer are used.
    tokenizer = Tokenizer(models.WordPiece())
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Split the head of the input corpus file and save it in the `temporary`
    # directory.
    subset_file = random_filename(temporary)
    _split_subset_from_file(input_file, subset_file, subset_size)

    # Train the model with the split subset of the corpus.
    trainer = WordPieceTrainer(vocab_size=vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=limit_alphabet,
                               special_tokens=[unk_token] + control_tokens,
                               continuing_subword_prefix='##')
    tokenizer.train(trainer, [subset_file])

    # Save the trained subword vocabulary in the `temporary` directory and rename
    # it to `vocab_file`.
    tokenizer.model.save(temporary)
    os.rename(os.path.join(temporary, 'vocab.txt'), vocab_file)

    # Remove the temporary subset corpus.
    os.remove(subset_file)
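# A hypothetical end-to-end use of train_tokenizer together with tokenize_corpus
# (defined earlier); all file paths and control tokens below are illustrative only:
train_tokenizer(input_file='corpus.raw.txt',
                vocab_file='vocab.txt',
                temporary='/tmp',
                vocab_size=8000,
                unk_token='<unk>',
                control_tokens=['<s>', '</s>', '<pad>'])
tokenize_corpus(input_file='corpus.raw.txt',
                output_file='corpus.tokenized.txt',
                vocab_file='vocab.txt',
                unk_token='<unk>',
                control_tokens=['<s>', '</s>', '<pad>'])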
def preprocess_data(args):
    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This tokenizer is
        # very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:
            file_name = generate_out_filename(input_file, args)
            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch, doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                labels = labels.nonzero()[1].tolist()

                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(
                                            json.dumps([block, labels, doc_idx]) + '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all(type(y) is int for y in window)
                                    out_f.write(
                                        json.dumps([window, labels]) + '\n')

                            example_batch = []
                            labels_batch = []
                            doc_idx_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels, doc_idx in zip(
                            example_batch, labels_batch, doc_idx_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        labels = labels.nonzero()[1].tolist()

                        if args.all_blocks or args.n_blocks > 0:
                            blocks = [example.ids] + [
                                blk.ids for blk in example.overflowing
                            ]
                            for b, block in enumerate(blocks, 2):
                                if b > args.n_blocks and args.n_blocks > 0:
                                    break
                                out_f.write(
                                    json.dumps([block, labels, doc_idx]) + '\n')
                        else:
                            out_f.write(
                                json.dumps([get_window(example, START_POS),
                                            labels]) + '\n')
def test_instantiate(self):
    assert BertPreTokenizer() is not None
    assert isinstance(BertPreTokenizer(), PreTokenizer)
    assert isinstance(BertPreTokenizer(), BertPreTokenizer)
    assert isinstance(pickle.loads(pickle.dumps(BertPreTokenizer())),
                      BertPreTokenizer)
def preprocess_data(args):
    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This tokenizer is
        # very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:
            file_name = generate_out_filename(input_file, args)
            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file],
                                len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:
                        example_batch.append(example)
                        labels_batch.append(labels)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels in zip(example_batch, labels_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                labels = labels.nonzero()[1].tolist()
                                out_f.write(
                                    json.dumps([example.ids, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels in zip(example_batch, labels_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        labels = labels.nonzero()[1].tolist()
                        out_f.write(json.dumps([example.ids, labels]) + '\n')
def __init__(self,
             vocab_file: Optional[str] = None,
             unk_token: Union[str, AddedToken] = "<unk>",
             sep_token: Union[str, AddedToken] = "</s>",
             cls_token: Union[str, AddedToken] = "<s>",
             nl_token: Union[str, AddedToken] = "<nl>",
             pad_token: Union[str, AddedToken] = "<pad>",
             mask_token: Union[str, AddedToken] = "<mask>",
             clean_text: bool = True,
             handle_chinese_chars: bool = True,
             separate_numbers: bool = True,
             strip_accents: bool = True,
             lowercase: bool = True,
             wordpieces_prefix: str = "##",
             special_chars: str = SPECIAL_CHARS,
             zh_norm: bool = True,
             handle_simpl: bool = True,
             do_postprocess: bool = False):
    if vocab_file is not None:
        tokenizer = Tokenizer(WordPiece(vocab_file, unk_token=str(unk_token)))
    else:
        tokenizer = Tokenizer(WordPiece())

    # Let the tokenizer know about special tokens if they are part of the vocab.
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(nl_token)) is not None:
        tokenizer.add_special_tokens([str(nl_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    tokenizer.normalizer = Sequence([
        NFKC(),
        BertNormalizer(clean_text=clean_text,
                       handle_chinese_chars=handle_chinese_chars,
                       separate_numbers=separate_numbers,
                       strip_accents=strip_accents,
                       lowercase=lowercase,
                       special_chars=special_chars,
                       zh_norm=zh_norm,
                       handle_simpl=handle_simpl)
    ])
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if vocab_file is not None and do_postprocess:
        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = BertProcessing(
            (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "nl_token": nl_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "separate_numbers": separate_numbers,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "special_chars": special_chars,
        "zh_norm": zh_norm,
        "handle_simpl": handle_simpl,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)