def test_lowercase(self):
    normalizer = BertNormalizer(strip_accents=False,
                                lowercase=True,
                                handle_chinese_chars=False,
                                clean_text=False)
    output = normalizer.normalize_str("Héllò")
    assert output == "héllò"
def train_huggingface_bpetokenizers(
    data_params: DatasetParams,
    query_files: List[Path],
    lang_files: Dict[str, Path]
) -> Tuple[TokenizerRecordable, TokenizerRecordable]:
    logger.info(
        f"Building Query BPETokenizer from query_files {query_files} "
        f"with do_lowercase:{data_params.do_lowercase} "
        f"special_tokens:{data_params.special_tokens}"
    )
    query_tokenizer = BPETokenizer()
    query_tokenizer.normalizer = BertNormalizer.new(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=data_params.do_lowercase)
    query_tokenizer.train(files=list(map(str, query_files)),
                          vocab_size=data_params.vocab_size,
                          special_tokens=data_params.special_tokens)

    code_tokenizer = BPETokenizer()
    code_tokenizer.normalizer = BertNormalizer.new(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=data_params.do_lowercase)
    code_tokenizer.train(
        files=list(map(str, lang_files.values())),
        vocab_size=data_params.vocab_size,
        special_tokens=data_params.special_tokens,
    )

    return (HuggingfaceBPETokenizerRecordable(query_tokenizer),
            HuggingfaceBPETokenizerRecordable(code_tokenizer))
def test_clean_text(self):
    normalizer = BertNormalizer(strip_accents=False,
                                lowercase=False,
                                handle_chinese_chars=False,
                                clean_text=True)
    output = normalizer.normalize_str("\ufeffHello")
    assert output == "Hello"
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile) -> AuxiliaryFile:
    subset = self._create_subset_file(afm, corpus)

    # Create WordPiece model with a normalizer and pre-tokenizer. Note that
    # BERT-specific normalizer and pre-tokenizer are used in this model.
    tokenizer = Tokenizer(WordPiece())
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Train tokenizer model with subset of corpus.
    trainer = WordPieceTrainer(vocab_size=self.vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=self.limit_alphabet,
                               special_tokens=[self.unk_token] + self.special_tokens,
                               continuing_subword_prefix='##')
    tokenizer.train(trainer, [subset.name])

    # Save trained vocabulary to an auxiliary output file.
    vocab = afm.create()
    tokenizer.model.save(os.path.dirname(vocab.name))
    os.rename(os.path.join(os.path.dirname(vocab.name), 'vocab.txt'),
              vocab.name)

    return vocab
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    no_consecutive_space: bool = True,
    dropout: Optional[float] = None,
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    separate_numbers: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
    special_chars: str = SPECIAL_CHARS,
    zh_norm: bool = True,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(BPE())

    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])

    tokenizer.normalizer = Sequence([
        NFKC(),
        BertNormalizer(clean_text=clean_text,
                       handle_chinese_chars=handle_chinese_chars,
                       separate_numbers=separate_numbers,
                       strip_accents=strip_accents,
                       lowercase=lowercase,
                       special_chars=special_chars,
                       zh_norm=zh_norm)
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
        no_consecutive_space=no_consecutive_space)
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
        no_consecutive_space=no_consecutive_space)

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "no_consecutive_space": no_consecutive_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def test_strip_accents(self):
    tokenizer = Tokenizer(BPE.empty())
    tokenizer.normalizer = BertNormalizer(strip_accents=True,
                                          lowercase=False,
                                          handle_chinese_chars=False,
                                          clean_text=False)
    output = tokenizer.normalize("Héllò")
    assert output == "Hello"
def __init__(
    self,
    vocab_file: Optional[str] = None,
    add_special_tokens: bool = True,
    unk_token: str = "[UNK]",
    sep_token: str = "[SEP]",
    cls_token: str = "[CLS]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    if vocab_file is not None:
        tokenizer = Tokenizer(
            WordPiece.from_files(vocab_file, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(WordPiece.empty())

    tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if add_special_tokens and vocab_file is not None:
        sep_token_id = tokenizer.token_to_id(sep_token)
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(cls_token)
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")
        tokenizer.post_processor = BertProcessing(
            (sep_token, sep_token_id), (cls_token, cls_token_id))

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "add_special_tokens": add_special_tokens,
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
def tokenize_corpus(input_file: str,
                    output_file: str,
                    vocab_file: str,
                    unk_token: str = '<unk>',
                    control_tokens: List[str] = []):
    r"""Tokenize corpus sentences through trained **WordPiece** model.

    Arguments:
        input_file (str): Input corpus file path.
        output_file (str): Output file path.
        vocab_file (str): Trained vocabulary file path.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.
    """
    # Create `WordPiece` model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(models.WordPiece(vocab_file, unk_token=unk_token))
    tokenizer.add_special_tokens([unk_token] + control_tokens)

    # Use BERT-specific normalizer, pre-tokenizer and **WordPiece** decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = decoders.WordPiece(prefix='##')

    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as dst:
        # Count total lines in corpus.
        total_lines = 0
        for _ in src:
            total_lines += 1

        # Move back to the beginning of the corpus file.
        src.seek(0)

        buffer = []
        for line in tqdm.tqdm(src,
                              desc='[*] tokenize corpus',
                              total=total_lines):
            buffer.append(line)

            # Tokenize buffered sentences and write to `output_file`.
            if len(buffer) > 10000:
                for t in tokenizer.encode_batch(buffer):
                    dst.write(' '.join(t.tokens) + '\n')
                buffer.clear()

        # Process the remaining buffer.
        if buffer:
            for t in tokenizer.encode_batch(buffer):
                dst.write(' '.join(t.tokens) + '\n')
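# Hypothetical invocation of `tokenize_corpus` above; the file paths and
# control tokens are placeholders, not files from the original project.
if __name__ == '__main__':
    tokenize_corpus(input_file='corpus.txt',
                    output_file='corpus.tokenized.txt',
                    vocab_file='vocab.txt',
                    unk_token='<unk>',
                    control_tokens=['<s>', '</s>', '<pad>'])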
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Optional[str] = "<unk>",
    suffix: Optional[str] = "</w>",
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(vocab_file,
                           merges_file,
                           dropout=dropout,
                           unk_token=unk_token,
                           end_of_word_suffix=suffix))
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else).
    normalizers = []
    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    # OpenAI normalization is the same as Bert.
    normalizers += [BertNormalizer()]

    # Create the normalizer structure.
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = BPEDecoder(suffix=suffix)

    parameters = {
        "model": "BPE",
        "unk_token": unk_token,
        "suffix": suffix,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def build(self, afm: AuxiliaryFileManager, corpus: AuxiliaryFile,
          vocab: AuxiliaryFile) -> AuxiliaryFile:
    total_lines = self._total_lines_in_file(corpus)

    # Create WordPiece model and add special tokens. Note that `unk_token`
    # is also a special token.
    tokenizer = Tokenizer(WordPiece(vocab.name, unk_token=self.unk_token))
    tokenizer.add_special_tokens(self.special_tokens + [self.unk_token])

    # Use BERT-specific normalizer, pre-tokenizer and decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = WordPieceDecoder(prefix='##')

    tokenized = afm.create()
    with corpus.open('r') as src, tokenized.open('w') as dst:
        # Create tqdm progress bar with colorful description.
        tqdm_iter = tqdm.tqdm(
            src,
            desc=colorful.render('<r>[*]</r> tokenize sentences with '
                                 '<g>WordPiece</g> model'),
            total=total_lines)

        batch_lines = []
        for line in tqdm_iter:
            batch_lines.append(line)

            # Encode the grouped batch sentences and write the tokenized
            # sentences to the auxiliary output file.
            if len(batch_lines) > self.batch_size:
                for t in tokenizer.encode_batch(batch_lines):
                    dst.write(' '.join(t.tokens) + '\n')
                batch_lines.clear()

        # Encode the remainders and write to the output file.
        if batch_lines:
            for t in tokenizer.encode_batch(batch_lines):
                dst.write(' '.join(t.tokens) + '\n')

    return tokenized
def __init__(self,
             vocab: Dict[str, int] = None,
             merges: List[Tuple[str, str]] = None,
             dropout: float = None,
             max_length: Optional[int] = 64) -> None:
    """Constructor

    Args:
        vocab (Dict[str, int]): A dictionary of string keys and their ids.
        merges (List[Tuple[str, str]]): A list of pairs of tokens.
        dropout (float): BPE dropout.
        max_length (int, optional): The max length at which to truncate.
            Defaults to `64`.
    """
    self.tokenizer = Tokenizer(
        BPE(vocab, merges, dropout=dropout, unk_token=self.unk_token))
    self.tokenizer.normalizer = BertNormalizer()  # noqa
    self.tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()  # noqa
    self.tokenizer.decoder = decoders.Metaspace()  # noqa
    self.tokenizer.add_special_tokens([self.pad_token, self.unk_token])
    self.tokenizer.enable_padding(pad_token=self.pad_token)
    self.tokenizer.enable_truncation(max_length)
def test_can_modify(self):
    normalizer = BertNormalizer(clean_text=True,
                                handle_chinese_chars=True,
                                strip_accents=True,
                                lowercase=True)

    assert normalizer.clean_text == True
    assert normalizer.handle_chinese_chars == True
    assert normalizer.strip_accents == True
    assert normalizer.lowercase == True

    # Modify these
    normalizer.clean_text = False
    assert normalizer.clean_text == False
    normalizer.handle_chinese_chars = False
    assert normalizer.handle_chinese_chars == False
    normalizer.strip_accents = None
    assert normalizer.strip_accents == None
    normalizer.lowercase = False
    assert normalizer.lowercase == False
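# A minimal sketch (not part of the test above) showing that the modified
# attributes take effect on subsequent normalization calls; the example
# string is an assumption.
from tokenizers.normalizers import BertNormalizer

norm = BertNormalizer(strip_accents=True, lowercase=True)
assert norm.normalize_str("Héllò") == "hello"
norm.strip_accents = False
norm.lowercase = False
assert norm.normalize_str("Héllò") == "Héllò"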
def preprocess_data(args):
    label_counter = Counter([])
    examples_per_file = Counter()
    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))
        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This tokenizer
        # is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:
            file_name = generate_out_filename(input_file, args)
            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)
                # Write the shape as the first row, useful for the finetuning.
                out_f.write(json.dumps((examples_per_file[input_file],
                                        len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:
                        example_batch.append(example)
                        labels_batch.append(labels)
                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(example_batch)
                            labels_batch = mlb.transform(labels_batch)
                            for example, labels in zip(example_batch, labels_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                labels = labels.nonzero()[1].tolist()
                                out_f.write(json.dumps([example.ids, labels]) + '\n')
                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)
                    for example, labels in zip(example_batch, labels_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        labels = labels.nonzero()[1].tolist()
                        out_f.write(json.dumps([example.ids, labels]) + '\n')
def train_tokenizer(input_file: str,
                    vocab_file: str,
                    temporary: str,
                    subset_size: int = 512000000,
                    vocab_size: int = 8000,
                    limit_alphabet: int = 6000,
                    unk_token: str = '<unk>',
                    control_tokens: List[str] = []):
    r"""Train **WordPiece** tokenizer and save trained subword vocabulary.

    Note:
        Since tokenizers_ reads the whole file into memory during training,
        this function could run out of memory if `input_file` is too large.
        Under the assumption that `input_file` is shuffled randomly, a subset
        of the input corpus will be used in training.

    Caution:
        The subset of the input corpus is saved in the `temporary` directory.
        Please be careful not to delete the file while executing this
        function.

    Arguments:
        input_file (str): Input file path.
        vocab_file (str): Output vocabulary file path.
        temporary (str): Temporary directory where the subset of corpus would
            be saved.
        subset_size (int): The maximum number of lines in the subset.
        vocab_size (int): The number of subwords in the vocabulary.
        limit_alphabet (int): The maximum number of alphabets in vocabulary.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.

    .. _tokenizers: https://github.com/huggingface/tokenizers
    """
    # Create **WordPiece** model and add normalizer and pre-tokenizer.
    # BERT-specific normalizer and pre-tokenizer are used.
    tokenizer = Tokenizer(models.WordPiece())
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Split the head of input corpus file and save in `temporary` directory.
    subset_file = random_filename(temporary)
    _split_subset_from_file(input_file, subset_file, subset_size)

    # Train the model with the split subset of the corpus.
    trainer = WordPieceTrainer(vocab_size=vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=limit_alphabet,
                               special_tokens=[unk_token] + control_tokens,
                               continuing_subword_prefix='##')
    tokenizer.train(trainer, [subset_file])

    # Save trained subword vocabulary in `temporary` directory and rename to
    # `vocab_file`.
    tokenizer.model.save(temporary)
    os.rename(os.path.join(temporary, 'vocab.txt'), vocab_file)

    # Remove temporary subset corpus.
    os.remove(subset_file)
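# Hypothetical call to `train_tokenizer` above; the paths, sizes and control
# tokens are placeholders chosen for illustration only.
train_tokenizer(input_file='corpus.shuffled.txt',
                vocab_file='vocab.txt',
                temporary='/tmp',
                vocab_size=8000,
                unk_token='<unk>',
                control_tokens=['<s>', '</s>', '<pad>'])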
def preprocess_data(args):
    label_counter = Counter([])
    examples_per_file = Counter()
    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))
        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This tokenizer
        # is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:
            file_name = generate_out_filename(input_file, args)
            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)
                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(json.dumps((examples_per_file[input_file],
                                        n_labels)) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []
                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        #example = ' '.join(example.split(' ')[-510:])
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)
                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(example_batch)
                            labels_batch = mlb.transform(labels_batch)
                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch, doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                labels = labels.nonzero()[1].tolist()
                                """try:
                                    [][0]
                                    print("DOC_LEN:", len(example.overflowing)+1)
                                    mid = len(example.overflowing)//2
                                    out_f.write(json.dumps(
                                        [example.overflowing[mid].ids, labels,
                                         len(example.overflowing)+1]) + '\n')
                                except IndexError:
                                    out_f.write(json.dumps(
                                        [example.ids, labels,
                                         len(example.overflowing)+1]) + '\n')"""
                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(json.dumps(
                                            [block, labels, doc_idx]) + '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all([type(y) is int for y in window])
                                    out_f.write(json.dumps([window, labels]) + '\n')
                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)
                    for example, labels, doc_idx in zip(
                            example_batch, labels_batch, doc_idx_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        labels = labels.nonzero()[1].tolist()
                        """try:
                            [][0]
                            print("DOC_LEN:", len(example.overflowing)+1)
                            mid = len(example.overflowing)//2
                            out_f.write(json.dumps(
                                [example.overflowing[mid].ids, labels,
                                 len(example.overflowing)+1]) + '\n')
                        except IndexError:
                            out_f.write(json.dumps(
                                [example.ids, labels,
                                 len(example.overflowing)+1]) + '\n')"""
                        if args.all_blocks or args.n_blocks > 0:
                            blocks = [example.ids] + [
                                blk.ids for blk in example.overflowing
                            ]
                            #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                            for b, block in enumerate(blocks, 2):
                                if b > args.n_blocks and args.n_blocks > 0:
                                    break
                                out_f.write(json.dumps(
                                    [block, labels, doc_idx]) + '\n')
                        else:
                            out_f.write(json.dumps(
                                [get_window(example, START_POS), labels]) + '\n')
def __init__(self,
             vocab_file: Optional[str] = None,
             unk_token: Union[str, AddedToken] = "<unk>",
             sep_token: Union[str, AddedToken] = "</s>",
             cls_token: Union[str, AddedToken] = "<s>",
             nl_token: Union[str, AddedToken] = "<nl>",
             pad_token: Union[str, AddedToken] = "<pad>",
             mask_token: Union[str, AddedToken] = "<mask>",
             clean_text: bool = True,
             handle_chinese_chars: bool = True,
             separate_numbers: bool = True,
             strip_accents: bool = True,
             lowercase: bool = True,
             wordpieces_prefix: str = "##",
             special_chars: str = SPECIAL_CHARS,
             zh_norm: bool = True,
             handle_simpl: bool = True,
             do_postprocess: bool = False):
    if vocab_file is not None:
        tokenizer = Tokenizer(WordPiece(vocab_file, unk_token=str(unk_token)))
    else:
        tokenizer = Tokenizer(WordPiece())

    # Let the tokenizer know about special tokens if they are part of the vocab.
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(nl_token)) is not None:
        tokenizer.add_special_tokens([str(nl_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    tokenizer.normalizer = Sequence([
        NFKC(),
        BertNormalizer(clean_text=clean_text,
                       handle_chinese_chars=handle_chinese_chars,
                       separate_numbers=separate_numbers,
                       strip_accents=strip_accents,
                       lowercase=lowercase,
                       special_chars=special_chars,
                       zh_norm=zh_norm,
                       handle_simpl=handle_simpl)
    ])
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if vocab_file is not None and do_postprocess:
        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")
        tokenizer.post_processor = BertProcessing(
            (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "nl_token": nl_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "separate_numbers": separate_numbers,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "special_chars": special_chars,
        "zh_norm": zh_norm,
        "handle_simpl": handle_simpl,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
for v in words:
    print(v)

# Start vocabulary with all standard special tokens. (PAD=0!)
vocab = {}
for special_token in ["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]", "[BOS]", "[EOS]"]:
    vocab[special_token] = len(vocab)
# Add other words - if not already present.
for w in words:
    if w not in vocab:
        vocab[w] = len(vocab)
print(vocab)

# New tokenizer.
init_tokenizer = BertWordPieceTokenizer(vocab=vocab)
init_tokenizer.normalizer = Sequence(
    [Replace("(", " ( "), Replace(")", " ) "), BertNormalizer()])
init_tokenizer.pre_tokenizer = Whitespace()
#init_tokenizer.pad_token_id = vocab["[PAD]"]
#print("Created tokenizer: ", init_tokenizer)

# Save the created tokenizer.
init_tokenizer.save(decoder_tokenizer_path)
print("Tokenizer saved to: ", decoder_tokenizer_path)

# Load from tokenizer file.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=decoder_tokenizer_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]',
                              'cls_token': '[CLS]',
                              'sep_token': '[SEP]',
                              'unk_token': '[UNK]',
                              'mask_token': '[MASK]',
                              'bos_token': '[BOS]',
                              'eos_token': '[EOS]'})
print(f"\nLoaded tokenizer vocabulary ({len(tokenizer.get_vocab())}):\n" + "-" * 50)
import string, re

from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents, Strip, BertNormalizer

normalizer = normalizers.Sequence([BertNormalizer(), Strip()])


def delete_punct(w: str) -> str:
    """Delete all punctuation in a string."""
    return w.lower().translate(
        str.maketrans(string.punctuation, len(string.punctuation) * " "))


def normalize(x):
    y = normalizer.normalize_str(delete_punct(x))
    y = y.replace("\n", " ")
    # Remove double spaces.
    y = re.sub(' +', ' ', y).strip()
    return y


def get_str(x):
    res = ''
    if isinstance(x, dict):
        for f in x:
            if f not in ['lang']:
                res += ' ' + get_str(x[f])
    if isinstance(x, str):
        res = x.strip()
    if isinstance(x, list):
        # Assumed completion, mirroring the dict branch above.
        for e in x:
            res += ' ' + get_str(e)
    return res
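# Rough usage sketch for `normalize` above; the input string and the expected
# result are assumptions based on BertNormalizer's default behaviour
# (lowercasing, accent stripping) plus the punctuation removal in delete_punct.
print(normalize("Héllo,  Wörld!\n"))  # -> "hello world"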
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    unk_token: Union[str, AddedToken] = "[UNK]",
    sep_token: Union[str, AddedToken] = "[SEP]",
    cls_token: Union[str, AddedToken] = "[CLS]",
    pad_token: Union[str, AddedToken] = "[PAD]",
    mask_token: Union[str, AddedToken] = "[MASK]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: Optional[bool] = None,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    if vocab is not None:
        tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
    else:
        tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

    # Let the tokenizer know about special tokens if they are part of the vocab.
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if vocab is not None:
        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")
        tokenizer.post_processor = BertProcessing(
            (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
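# Minimal usage sketch, assuming the __init__ above belongs to the library's
# BertWordPieceTokenizer class; "vocab.txt" is a placeholder file and the
# exact wordpieces depend on its contents.
tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=True)
encoding = tokenizer.encode("Héllo, world!")
print(encoding.tokens)  # e.g. ['[CLS]', 'hello', ',', 'world', '!', '[SEP]']
print(encoding.ids)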
'''
Separate the Chinese and English parts of a sentence, using huggingface/tokenizers.
https://github.com/huggingface/tokenizers/blob/master/bindings/python/tests/bindings/test_normalizers.py
'''
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import BertNormalizer

text = "薛定谔的猫(英文名称:Erwin Schrödinger's Cat)是奥地利著名物理学家薛定谔"

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = BertNormalizer(strip_accents=False,
                                      lowercase=False,
                                      handle_chinese_chars=True,
                                      clean_text=False)
output = tokenizer.normalize(text)
print(output)
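# With handle_chinese_chars=True, the normalizer inserts spaces around each CJK
# character, so the printed output should look roughly like
# "薛 定 谔 的 猫 (英文名称:Erwin Schrödinger's Cat)..." while the Latin text
# (including the "ö", since strip_accents=False) is left untouched.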
def test_instantiate(self):
    assert isinstance(BertNormalizer(), Normalizer)
    assert isinstance(BertNormalizer(), BertNormalizer)
    assert isinstance(pickle.loads(pickle.dumps(BertNormalizer())), BertNormalizer)
# Initialize a new tokenizer with "frozen" vocabulary.
#tokenizer = Tokenizer(BPE())
#tokenizer.normalizer = Lowercase()
#tokenizer.pre_tokenizer = CharDelimiterSplit(' ')
init_tokenizer = BertWordPieceTokenizer(vocab=vocab)
#special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) <- wrong keyword
#bos_token = "[CLS]", eos_token = "[SEP]", unk_token = "[UNK]", sep_token = "[SEP]", <- wrong keywords
#pad_token = "[PAD]", cls_token = "[CLS]", mask_token = "[MASK]", <- wrong keywords
init_tokenizer.normalizer = Sequence(
    [Replace("(", " ( "), Replace(")", " ) "), BertNormalizer()])
init_tokenizer.pre_tokenizer = Whitespace()
#init_tokenizer.add_special_tokens({'pad_token': '[PAD]'}) <- must be list
#init_tokenizer.add_special_tokens(["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]", "[BOS]", "[EOS]"]) <- doesn't work
#init_tokenizer.add_special_tokens(['[PAD]']) <- doesn't work
#init_tokenizer.pad_token = "[PAD]" <- doesn't work
init_tokenizer.pad_token_id = vocab["[PAD]"]

# "Set" special tokens?
#init_tokenizer.bos_token_id = vocab["[CLS]"]
#init_tokenizer.eos_token_id = vocab["[SEP]"]
#init_tokenizer.unk_token_id = vocab["[UNK]"]
#init_tokenizer.sep_token_id = vocab["[SEP]"]
#init_tokenizer.pad_token_id = vocab["[PAD]"]
#init_tokenizer.cls_token_id = vocab["[CLS]"]
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # Use ByteLevel PreTokenizer
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    # Use ByteLevel Decoder
    tok_r.decoder = decoders.ByteLevel()
elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        WordPiece.from_files(args.vocab,
                             unk_token="[UNK]",
                             max_input_chars_per_word=100))
    tok_r.normalizer = BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )
    # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tok_r.decoder = decoders.WordPiece()
    tok_r.post_processor = BertProcessing(
        ("[SEP]", tok_r.token_to_id("[SEP]")),
        ("[CLS]", tok_r.token_to_id("[CLS]")),
    )
else:
    raise Exception(f"Unknown type {args.type}")


def tokenize_r():
    return tok_r.encode_batch(text)
def test_instantiate(self):
    assert isinstance(BertNormalizer(), Normalizer)
    assert isinstance(BertNormalizer(), BertNormalizer)