def test_encode_add_special_tokens(self, roberta_files):
    """Check that ``add_special_tokens`` toggles the ``<s>`` / ``</s>`` wrapping.

    ``roberta_files`` is a fixture mapping ``"vocab"`` and ``"merges"`` to the
    files used to build the BPE model.
    """
    text = "My name is John"
    body = ["ĠMy", "Ġname", "Ġis", "ĠJohn"]

    # Constructing the model this way is expected to trigger a deprecation
    # warning; the context manager fails the test if none is raised.
    with pytest.deprecated_call():
        tokenizer = Tokenizer(
            BPE(roberta_files["vocab"], roberta_files["merges"])
        )

    tokenizer.add_special_tokens(["<s>", "</s>"])
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
    sep = ("</s>", tokenizer.token_to_id("</s>"))
    cls = ("<s>", tokenizer.token_to_id("<s>"))
    tokenizer.post_processor = RobertaProcessing(sep, cls)

    # Can encode with special tokens
    output_with_specials = tokenizer.encode(text, add_special_tokens=True)
    assert output_with_specials.tokens == ["<s>"] + body + ["</s>"]

    # Can encode without special tokens
    output_without_specials = tokenizer.encode(text, add_special_tokens=False)
    assert output_without_specials.tokens == body
def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    """Build a word-level tokenizer from ``vocab_file``.

    Args:
        vocab_file: Vocabulary file handed to ``WordLevel.from_files``.
        delimiter: Character to split on; when falsy, whitespace splitting
            is used instead.
        lowercase: When truthy, a ``Lowercase`` normalizer is added.
        unk_token: Unknown token given to the ``WordLevel`` model.
        eos_token: Token used on both sides of the post-processor when
            ``add_double_eos`` is set.
        add_eos: Only recorded in the parameters passed to the base class.
        add_double_eos: When truthy, installs a ``BertProcessing``
            post-processor built from ``eos_token``.
        normalization: Optional name resolved through
            ``unicode_normalizer_from_str``.

    Raises:
        ValueError: If the vocabulary file cannot be loaded. The underlying
            error is attached as ``__cause__``.
    """
    try:
        tokenizer = Tokenizer(WordLevel.from_files(vocab_file, unk_token=unk_token))
    except Exception as err:
        # Chain the original failure so the real parsing error is not lost.
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizer, "
            "please note they are not compatible.".format(vocab_file)
        ) from err

    # Normalization pipeline: optional unicode normalization, optional
    # lowercasing, and always a final strip on both sides.
    steps = []
    if normalization:
        steps.append(unicode_normalizer_from_str(normalization))
    if lowercase:
        steps.append(Lowercase())
    steps.append(Strip(left=True, right=True))
    # ``steps`` is never empty here because Strip is always present.
    tokenizer.normalizer = Sequence(steps) if len(steps) > 1 else steps[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

    if add_double_eos:
        eos_id = tokenizer.token_to_id(eos_token)
        tokenizer.post_processor = BertProcessing(
            (eos_token, eos_id), (eos_token, eos_id)
        )

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab_file: Optional[str] = None,
    add_special_tokens: bool = True,
    unk_token: str = "[UNK]",
    sep_token: str = "[SEP]",
    cls_token: str = "[CLS]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    """Build a BERT-style WordPiece tokenizer.

    Args:
        vocab_file: Vocabulary file; when ``None`` an empty WordPiece model
            is created and no post-processor is installed.
        add_special_tokens: When true (and a vocabulary is given), installs
            a ``BertProcessing`` post-processor using ``sep_token`` and
            ``cls_token``.
        unk_token, sep_token, cls_token: Special tokens registered on the
            tokenizer.
        clean_text, handle_chinese_chars, strip_accents, lowercase:
            Forwarded to ``BertNormalizer``.
        wordpieces_prefix: Prefix given to the WordPiece decoder.

    Raises:
        TypeError: If post-processing is requested and ``sep_token`` or
            ``cls_token`` is missing from the vocabulary.
    """
    if vocab_file is not None:
        tokenizer = Tokenizer(WordPiece.from_files(vocab_file, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(WordPiece.empty())

    tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if add_special_tokens and vocab_file is not None:
        sep_token_id = tokenizer.token_to_id(sep_token)
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(cls_token)
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = BertProcessing(
            (sep_token, sep_token_id), (cls_token, cls_token_id)
        )

    # Fixed: the attribute is ``decoder`` (singular). The previous
    # ``tokenizer.decoders = ...`` never installed the WordPiece decoder.
    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "add_special_tokens": add_special_tokens,
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "[UNK]",
    pad_token: Union[str, AddedToken] = "[PAD]",
    mask_token: Union[str, AddedToken] = "[MASK]",
    lowercase: bool = False,
    unicode_normalizer: Optional[str] = None,
):
    """Build a word-level tokenizer split on whitespace.

    Args:
        vocab_file: Vocabulary passed to ``WordLevel``; when ``None`` the
            model is created without a vocabulary.
        unk_token, pad_token, mask_token: Special tokens. Each one is
            registered as special only if it already has an id in the
            vocabulary.
        lowercase: When true, a ``Lowercase`` normalizer is added.
        unicode_normalizer: Optional name resolved through
            ``unicode_normalizer_from_str``; applied before lowercasing.
    """
    # The model takes the token text, so coerce the same way the lookups
    # below do (``unk_token`` may be an ``AddedToken``).
    unk_text = str(unk_token)

    if vocab_file is not None:
        logging.info("Initiating tokenizer at %s", vocab_file)
        tokenizer = Tokenizer(WordLevel(vocab=vocab_file, unk_token=unk_text))
    else:
        tokenizer = Tokenizer(WordLevel(unk_token=unk_text))

    # Let the tokenizer know about special tokens if they are part of the vocab
    for special in (unk_token, pad_token, mask_token):
        text = str(special)
        if tokenizer.token_to_id(text) is not None:
            tokenizer.add_special_tokens([text])

    # Unicode normalization comes first, before everything else.
    steps = []
    if unicode_normalizer:
        steps.append(unicode_normalizer_from_str(unicode_normalizer))
    if lowercase:
        steps.append(Lowercase())

    # A single step is installed directly; several are wrapped in a Sequence.
    if len(steps) == 1:
        tokenizer.normalizer = steps[0]
    elif steps:
        tokenizer.normalizer = Sequence(steps)

    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
        "unicode_normalizer": unicode_normalizer,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    """Assemble a word-level tokenizer loaded from ``vocab_file``.

    The normalizer is built from the optional unicode ``normalization``
    and the ``lowercase`` flag. Splitting uses ``delimiter`` when given,
    otherwise whitespace. With ``add_double_eos`` a ``BertProcessing``
    post-processor is installed using ``eos_token`` for both slots.
    """
    tokenizer = Tokenizer(WordLevel.from_files(vocab_file, unk_token=unk_token))

    # Collect the normalization steps in application order.
    steps = []
    if normalization:
        steps.append(unicode_normalizer_from_str(normalization))
    if lowercase:
        steps.append(Lowercase())

    if len(steps) == 1:
        tokenizer.normalizer = steps[0]
    elif steps:
        tokenizer.normalizer = Sequence(steps)

    # Choose the splitter.
    if delimiter:
        tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter)
    else:
        tokenizer.pre_tokenizer = WhitespaceSplit()

    if add_double_eos:
        first = (eos_token, tokenizer.token_to_id(eos_token))
        second = (eos_token, tokenizer.token_to_id(eos_token))
        tokenizer.post_processor = BertProcessing(first, second)

    parameters = dict(
        model="TransfoXLModel",
        add_eos=add_eos,
        add_double_eos=add_double_eos,
        unk_token=unk_token,
        eos_token=eos_token,
        delimiter=delimiter,
        lowercase=lowercase,
    )

    super().__init__(tokenizer, parameters)
def train_tokenizer(langs, dataset, vocab_size):
    """Train a byte-level BPE tokenizer over several languages.

    A special token ``[LANG]`` is reserved for every language tag in
    ``langs``. They are listed after the five base special tokens, so they
    are intended to receive ids 5, 6, ..., len(langs) + 4.

    Returns the trained tokenizer with a ``[CLS] ... [SEP]`` template
    post-processor installed.
    """
    special_tokens = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]']
    for lang in langs:
        special_tokens.append('[' + lang + ']')

    # Byte-pair encoding model
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
    trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)

    # Normalise and pre-tokenize
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # Train on an iterator over the dataset for the requested languages
    tokenizer.train_from_iterator(_MultilingualIterator(dataset, langs), trainer)

    # Post-process: wrap sequences in start/end tokens
    cls_entry = ("[CLS]", tokenizer.token_to_id("[CLS]"))
    sep_entry = ("[SEP]", tokenizer.token_to_id("[SEP]"))
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[cls_entry, sep_entry],
    )

    return tokenizer
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    dropout: Optional[float] = None,
):
    """Build a SentencePiece-style BPE tokenizer.

    Args:
        vocab, merges: Vocabulary and merges for the BPE model. Both must
            be given; otherwise an empty ``BPE()`` model is created.
        unk_token: Unknown token. Registered as special only if it already
            has an id in the vocabulary.
        replacement: Character passed to the Metaspace pre-tokenizer and
            decoder.
        add_prefix_space: Passed to the Metaspace pre-tokenizer and decoder.
        dropout: Passed to the BPE model when a vocabulary is given.
    """
    # ``unk_token`` may be an ``AddedToken``; use its text consistently,
    # both for the model and for the vocabulary lookup.
    unk_text = str(unk_token)

    if vocab is not None and merges is not None:
        tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_text))
    else:
        tokenizer = Tokenizer(BPE())

    if tokenizer.token_to_id(unk_text) is not None:
        tokenizer.add_special_tokens([unk_text])

    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def train_tokenizer(lang, dataset, vocab_size):
    """Train a whitespace-split BPE tokenizer on ``dataset[lang]``.

    Returns the trained tokenizer with a ``[CLS] ... [SEP]`` template
    post-processor installed.
    """
    reserved = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]']

    # Byte-pair encoding model and its trainer
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
    trainer = BpeTrainer(special_tokens=reserved, vocab_size=vocab_size)

    # Pre-tokenize on whitespace, then train
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator(dataset[lang], trainer)

    # Post-process: wrap sequences in start/end tokens
    cls_entry = ("[CLS]", tokenizer.token_to_id("[CLS]"))
    sep_entry = ("[SEP]", tokenizer.token_to_id("[SEP]"))
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[cls_entry, sep_entry],
    )

    return tokenizer
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    no_consecutive_space: bool = True,
    dropout: Optional[float] = None,
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    separate_numbers: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
    special_chars: str = SPECIAL_CHARS,
    zh_norm: bool = True,
):
    """Build a SentencePiece-style BPE tokenizer with BERT normalization.

    Args:
        vocab_file, merges_file: Files for the BPE model. Both must be
            given; otherwise an empty ``BPE()`` model is created.
        unk_token: Unknown token. Registered as special only if it already
            has an id in the vocabulary.
        replacement, add_prefix_space, no_consecutive_space: Passed to the
            Metaspace pre-tokenizer and decoder.
        dropout: Passed to the BPE model when files are given.
        clean_text, handle_chinese_chars, separate_numbers, strip_accents,
        lowercase, special_chars, zh_norm: Forwarded to ``BertNormalizer``,
            which runs after ``NFKC``.
        wordpieces_prefix: Accepted but not used by this constructor.
    """
    # ``unk_token`` may be an ``AddedToken``; use its text consistently,
    # both for the model and for the vocabulary lookup.
    unk_text = str(unk_token)

    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_text)
        )
    else:
        tokenizer = Tokenizer(BPE())

    if tokenizer.token_to_id(unk_text) is not None:
        tokenizer.add_special_tokens([unk_text])

    tokenizer.normalizer = Sequence([
        NFKC(),
        BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            separate_numbers=separate_numbers,
            strip_accents=strip_accents,
            lowercase=lowercase,
            special_chars=special_chars,
            zh_norm=zh_norm,
        ),
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
        no_consecutive_space=no_consecutive_space,
    )
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
        no_consecutive_space=no_consecutive_space,
    )

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "no_consecutive_space": no_consecutive_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    batch_size: int = 1,
    val_batch_size: int = None,
    dataset=None,
    languages=None,
    tokenizer: Tokenizer = None,
    device='cpu',
):
    """Store the data-module configuration and prepare the tokenizer.

    The given tokenizer is modified in place: padding with ``[PAD]`` is
    enabled and its post-processor is replaced by a
    ``[TRANSLATE] ... [SEP]`` template.

    Args:
        batch_size: Training batch size.
        val_batch_size: Validation batch size; falls back to ``batch_size``
            when ``None``.
        dataset: Required dataset object.
        languages: Required languages value.
        tokenizer: Required tokenizer; its vocabulary is queried for
            ``[PAD]``, ``[TRANSLATE]`` and ``[SEP]``.
        device: Stored on the instance.

    Raises:
        ValueError: If ``dataset``, ``languages`` or ``tokenizer`` is
            ``None``.
    """
    super(WMT20DataModule, self).__init__()
    self.batch_size = batch_size
    self.val_batch_size = val_batch_size if val_batch_size is not None else batch_size

    if dataset is None:
        raise ValueError(f"dataset is required for {self}")
    self.dataset = dataset

    if languages is None:
        raise ValueError(f"languages is required for {self}")
    self.languages = languages

    # The tokenizer is dereferenced below, so reject ``None`` with the same
    # kind of error as the other required arguments instead of failing
    # later with an AttributeError.
    if tokenizer is None:
        raise ValueError(f"tokenizer is required for {self}")
    self.tokenizer = tokenizer

    pad_token = "[PAD]"
    tokenizer.enable_padding(pad_id=tokenizer.token_to_id(pad_token), pad_token=pad_token)

    tokenizer.post_processor = TemplateProcessing(
        single="[TRANSLATE] $0 [SEP]",
        special_tokens=[
            ("[TRANSLATE]", tokenizer.token_to_id('[TRANSLATE]')),
            ("[SEP]", tokenizer.token_to_id('[SEP]')),
        ],
    )

    self.device = device
def __init__(
    self,
    vocab_file,
    sep_token="<sep>",
    cls_token="<cls>",
    pad_token="<pad>",
    mask_token="<mask>",
    lowercase: bool = True,
    unk_token="<unk>",
):
    """Build a word-level tokenizer split on single spaces.

    The post-processor wraps sequences using the literal ``<s>`` / ``</s>``
    tokens looked up in the vocabulary, and truncation is enabled at 512
    tokens.

    Args:
        vocab_file: Vocabulary passed to ``WordLevel``.
        sep_token, cls_token, pad_token, mask_token: Special tokens. Each
            one is registered as special only if it already has an id in
            the vocabulary.
        lowercase: Only recorded in the parameters; no lowercasing
            normalizer is installed by this constructor.
        unk_token: Unknown token for the model. Added as a trailing keyword
            parameter because the body referenced ``unk_token`` without
            defining it.
            NOTE(review): if the module defines a global ``unk_token``,
            confirm that this default matches it.
    """
    tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=str(unk_token)))
    tokenizer.normalizer = Strip()
    tokenizer.pre_tokenizer = CharDelimiterSplit(" ")

    tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    # Let the tokenizer know about special tokens if they are part of the vocab
    for special in (unk_token, sep_token, cls_token, pad_token, mask_token):
        text = str(special)
        if tokenizer.token_to_id(text) is not None:
            tokenizer.add_special_tokens([text])

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def converted(self) -> Tokenizer:
    """Return a ``Tokenizer`` rebuilt from ``self.original_tokenizer``.

    The BPE model reuses the original encoder as vocabulary and the keys of
    ``bpe_ranks`` as merges, with ``</w>`` as end-of-word suffix. BERT-style
    lowercasing normalization and pre-tokenization are installed, together
    with a BPE decoder for the same suffix.
    """
    source = self.original_tokenizer
    vocab = source.encoder
    merges = list(source.bpe_ranks.keys())
    unk = str(source.unk_token)

    model = BPE(
        vocab=vocab,
        merges=merges,
        dropout=None,
        unk_token=unk,
        end_of_word_suffix="</w>",
        fuse_unk=False,
    )
    tokenizer = Tokenizer(model)

    # Mark the unknown token as special only when the vocabulary knows it.
    if tokenizer.token_to_id(unk) is not None:
        tokenizer.add_special_tokens([unk])

    tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")

    return tokenizer
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: str = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    dropout: Optional[float] = None,
):
    """Build a SentencePiece-style BPE tokenizer.

    The BPE model is loaded from files when both ``vocab_file`` and
    ``merges_file`` are given, and is empty otherwise. ``unk_token`` is
    registered as special only if the vocabulary already contains it.
    Text is NFKC-normalized and split and decoded with Metaspace.
    """
    have_files = vocab_file is not None and merges_file is not None
    if have_files:
        model = BPE.from_files(
            vocab_file, merges_file, dropout=dropout, unk_token=unk_token
        )
    else:
        model = BPE.empty()
    tokenizer = Tokenizer(model)

    unk_id = tokenizer.token_to_id(unk_token)
    if unk_id is not None:
        tokenizer.add_special_tokens([unk_token])

    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )

    parameters = dict(
        model="SentencePieceBPE",
        unk_token=unk_token,
        replacement=replacement,
        add_prefix_space=add_prefix_space,
        dropout=dropout,
    )

    super().__init__(tokenizer, parameters)
tok_r = Tokenizer( WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100)) tok_r.normalizer = BertNormalizer( clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True, ) # tok_r.pre_tokenizer = pre_tokenizers.Whitespace() tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer() tok_r.decoder = decoders.WordPiece() tok_r.post_processor = BertProcessing( ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")), ) else: raise Exception(f"Unknown type {args.type}") def tokenize_r(): return tok_r.encode_batch(text) def tokenize_p(): return [ tok_p.encode(sentence, add_special_tokens=True) for sentence in tqdm(text) ]
vocab_path = data_path / 'vocab.json' tokenizer_tmp_path = data_path / 'tokenizer_tmp' tokenizer_path = data_path / 'tokenizer' text_tokenizer = TextTokenizer(dataset_path) text_tokenizer.load_vocab(vocab_path) vocab = text_tokenizer.vocab vocab_count = len(vocab.keys()) vocab.update({'<|endoftext|>': vocab_count}) tokenizer_tmp = Tokenizer(WordLevel(text_tokenizer.vocab)) tokenizer_tmp.pre_tokenizer = CharDelimiterSplit(' ') tokenizer_tmp.post_processor = BertProcessing( ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>")), ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>")), ) tokenizer_tmp_path.mkdir(parents=True, exist_ok=True) tokenizer_tmp.save(str(tokenizer_tmp_path / "tokenizer.json")) # Re-create as GPT2 compatible tokenizer class GPT2CompatibleTokenizer(PreTrainedTokenizerFast): def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: file = str(tokenizer_path / "tokenizer.json") tokenizer.backend_tokenizer.save(file)
def preprocess_data(args):
    """Tokenize the input files and write them out as JSON lines.

    Works in two passes over ``args.input_files``:

    1. Count the examples per file and the label frequencies, fit the label
       binarizer (optionally restricted to the ``args.top_n_labels`` most
       common labels) and write the label lists.
    2. Tokenize every file in batches and write one JSON line per example,
       ``[token_ids, label_indices]``, preceded by a header line
       ``[n_examples, n_labels]``.

    Args:
        args: Namespace providing ``input_files``, ``task``,
            ``top_n_labels``, ``label_mapping``, ``full_labels``,
            ``labels_out``, ``processes``, ``vocab``, ``do_lower_case`` and
            ``seq_len``.
    """
    label_counter = Counter()
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        # NOTE(review): this fits on the bare keys, while the main binarizer
        # below fits on a list wrapped in a list — confirm that is intended.
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([list(label_counter)])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        # ``np.isin`` replaces the deprecated ``np.in1d``.
        label_mapping = np.where(np.isin(mlb_full.classes_, mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count. The tokenizers imports are
    # kept below this assignment, as in the original (presumably so that
    # the setting is in place before the library is used).
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This
        # tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(
            args.vocab, unk_token="[UNK]", max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab, lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    def _write_batch(out_f, examples, label_rows):
        """Tokenize one batch and write a JSON line per example."""
        encodings = tokenizer.encode_batch(examples)
        label_matrix = mlb.transform(label_rows)
        for encoding, row in zip(encodings, label_matrix):
            # Convert sparse arrays to python lists for json dumping.
            label_ids = row.nonzero()[1].tolist()
            out_f.write(json.dumps([encoding.ids, label_ids]) + '\n')

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:
            file_name = generate_out_filename(input_file, args)
            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file], len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file], args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:
                        example_batch.append(example)
                        labels_batch.append(labels)

                        if len(example_batch) == batch_size:
                            _write_batch(out_f, example_batch, labels_batch)
                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch;
                    # nothing to do when the file split evenly.
                    if example_batch:
                        _write_batch(out_f, example_batch, labels_batch)
def __init__(
    self,
    target_vocab,
):
    """Build and save a word-level tokenizer over ``target_vocab``.

    Vocabulary layout: ``[PAD]`` is id 0, followed by ``[UNK]``, ``[CLS]``,
    ``[SEP]`` and ``[MASK]``, followed by the words of ``target_vocab`` in
    iteration order. The tokenizer splits on whitespace, wraps sequences
    with ``[CLS]`` / ``[SEP]``, and is written to
    ``PRETRAINED_TOKENIZER_FILE`` after the base class is initialized.
    """
    special_tokens = {
        "pad_token": "[PAD]",
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]",
    }

    vocab = {special_tokens["pad_token"]: 0}
    next_id = 1
    unused = 0

    # Optional BERT-like layout (disabled): fill 1..99 and the range after
    # the special tokens up to 999 with "[unusedN]" placeholders, so that
    # 0 is padding, 1xx are the other special tokens and four-digit ids are
    # actual payload.
    fill_tokens = False

    if fill_tokens:
        for _ in range(next_id, 100):
            vocab[f"[unused{unused}]"] = next_id
            next_id += 1
            unused += 1

    for key in ("unk_token", "cls_token", "sep_token", "mask_token"):
        vocab[special_tokens[key]] = next_id
        next_id += 1

    if fill_tokens:
        for _ in range(next_id, 1000):
            vocab[f"[unused{unused}]"] = next_id
            next_id += 1
            unused += 1

    for word_id, word in enumerate(target_vocab, start=next_id):
        vocab[word] = word_id

    model = WordLevel(vocab=vocab, unk_token=special_tokens["unk_token"])
    tokenizer = Tokenizer(model)
    tokenizer.add_special_tokens(list(special_tokens.values()))
    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    sep_text = special_tokens["sep_token"]
    cls_text = special_tokens["cls_token"]
    sep_token_id = tokenizer.token_to_id(sep_text)
    cls_token_id = tokenizer.token_to_id(cls_text)
    tokenizer.post_processor = processors.BertProcessing(
        (sep_text, sep_token_id), (cls_text, cls_token_id)
    )

    # The base class receives the special tokens plus the model name.
    parameters = dict(special_tokens, model="WordLevel")

    super().__init__(tokenizer, parameters)

    tokenizer.save(PRETRAINED_TOKENIZER_FILE)
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    unk_token: Union[str, AddedToken] = "[UNK]",
    sep_token: Union[str, AddedToken] = "[SEP]",
    cls_token: Union[str, AddedToken] = "[CLS]",
    pad_token: Union[str, AddedToken] = "[PAD]",
    mask_token: Union[str, AddedToken] = "[MASK]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: Optional[bool] = None,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    """Build a BERT-style WordPiece tokenizer.

    Each special token is registered only if the vocabulary already has an
    id for it. When ``vocab`` is given, a ``BertProcessing`` post-processor
    is installed and ``TypeError`` is raised if ``sep_token`` or
    ``cls_token`` is missing from the vocabulary.
    """
    unk_text = str(unk_token)
    sep_text = str(sep_token)
    cls_text = str(cls_token)

    if vocab is None:
        model = WordPiece(unk_token=unk_text)
    else:
        model = WordPiece(vocab, unk_token=unk_text)
    tokenizer = Tokenizer(model)

    # Let the tokenizer know about special tokens if they are part of the vocab
    for text in (unk_text, sep_text, cls_text, str(pad_token), str(mask_token)):
        if tokenizer.token_to_id(text) is not None:
            tokenizer.add_special_tokens([text])

    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if vocab is not None:
        sep_token_id = tokenizer.token_to_id(sep_text)
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(cls_text)
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = BertProcessing(
            (sep_text, sep_token_id), (cls_text, cls_token_id)
        )

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = dict(
        model="BertWordPiece",
        unk_token=unk_token,
        sep_token=sep_token,
        cls_token=cls_token,
        pad_token=pad_token,
        mask_token=mask_token,
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
        wordpieces_prefix=wordpieces_prefix,
    )

    super().__init__(tokenizer, parameters)
from typing import Optional, Tuple data_path = Path('/workspace/poetry2021.gt/data/pan_tadeusz4') dataset_path = data_path / 'dataset' vocab_path = data_path / 'vocab.json' tokenizer_tmp_path = data_path / 'tokenizer_tmp' tokenizer_path = data_path / 'tokenizer' text_tokenizer = TextTokenizer(dataset_path) text_tokenizer.load_vocab(vocab_path) tokenizer_tmp = Tokenizer(WordLevel(text_tokenizer.vocab)) tokenizer_tmp.pre_tokenizer = CharDelimiterSplit(' ') tokenizer_tmp.post_processor = BertProcessing( ("</s>", tokenizer_tmp.token_to_id("</s>")), ("<s>", tokenizer_tmp.token_to_id("<s>")), ) tokenizer_tmp_path.mkdir(parents=True, exist_ok=True) tokenizer_tmp.save(str(tokenizer_tmp_path / "tokenizer.json")) # Re-create as roberta compatible tokenizer class RobertaCompatibleTokenizer(PreTrainedTokenizerFast): def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: file = str(tokenizer_path / "tokenizer.json") tokenizer.backend_tokenizer.save(file)
def preprocess_data(args):
    """Tokenize the input files and write them out as JSON lines.

    Works in two passes over ``args.input_files``:

    1. Count the examples per file and the label frequencies, fit the label
       binarizer (on ``args.labels_in`` when given, otherwise on the
       counted labels, optionally restricted to the ``args.top_n_labels``
       most common) and write the label lists.
    2. Tokenize every file in batches and write JSON lines after a header
       line ``[n_examples, n_labels]``. In block mode (``args.all_blocks``
       or ``args.n_blocks > 0``) each line is
       ``[block_ids, label_indices, doc_idx]``; otherwise each line is
       ``[window_ids, label_indices]`` with the window chosen by
       ``get_window``.

    Args:
        args: Namespace providing ``input_files``, ``task``,
            ``top_n_labels``, ``labels_in``, ``label_mapping``,
            ``full_labels``, ``labels_out``, ``processes``, ``vocab``,
            ``do_lower_case``, ``seq_len``, ``window_start``,
            ``all_blocks`` and ``n_blocks``.
    """
    label_counter = Counter()
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        # NOTE(review): this fits on the bare keys, while the main binarizer
        # below fits on a list wrapped in a list — confirm that is intended.
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    # Read the predefined label list once and close the file; it was
    # previously re-opened for every input file and never closed.
    labels_in = None
    if args.labels_in:
        with open(args.labels_in) as f:
            labels_in = json.load(f)

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        mlb = mlb.fit([labels_in])
        n_labels = len(labels_in)
    else:
        mlb = mlb.fit([list(label_counter)])
        n_labels = len(label_counter)

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        # ``np.isin`` replaces the deprecated ``np.in1d``.
        label_mapping = np.where(np.isin(mlb_full.classes_, mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count. The tokenizers imports are
    # kept below this assignment, as in the original (presumably so that
    # the setting is in place before the library is used).
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This
        # tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(
            args.vocab, unk_token="[UNK]", max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab, lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    def _write_batch(out_f, examples, label_rows, doc_ids, start_pos, check_window):
        """Tokenize one batch and write its JSON lines to ``out_f``."""
        encodings = tokenizer.encode_batch(examples)
        label_matrix = mlb.transform(label_rows)
        for encoding, row, doc_idx in zip(encodings, label_matrix, doc_ids):
            # Convert sparse arrays to python lists for json dumping.
            label_ids = row.nonzero()[1].tolist()

            if args.all_blocks or args.n_blocks > 0:
                blocks = [encoding.ids] + [blk.ids for blk in encoding.overflowing]
                # NOTE(review): the counter starts at 2, so a positive
                # ``n_blocks`` writes at most ``n_blocks - 1`` blocks.
                # Kept as in the original — confirm this is intended.
                for b, block in enumerate(blocks, 2):
                    if b > args.n_blocks and args.n_blocks > 0:
                        break
                    out_f.write(json.dumps([block, label_ids, doc_idx]) + '\n')
            else:
                window = get_window(encoding, start_pos)
                if check_window:
                    assert len(window) == 512
                    assert all([type(y) is int for y in window])
                out_f.write(json.dumps([window, label_ids]) + '\n')

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:
            file_name = generate_out_filename(input_file, args)
            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) + '\n')

                batch_size = min(examples_per_file[input_file], args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            _write_batch(out_f, example_batch, labels_batch,
                                         doc_idx_batch, START_POS, True)
                            example_batch = []
                            labels_batch = []
                            # Fixed: the index list must be reset together
                            # with the other two; otherwise every later
                            # batch is zipped with the first batch's
                            # document indices.
                            doc_idx_batch = []

                    # Write out whatever is left in the last smaller batch.
                    # The window checks were only ever applied to full
                    # batches; that is preserved here.
                    if example_batch:
                        _write_batch(out_f, example_batch, labels_batch,
                                     doc_idx_batch, START_POS, False)
def __init__(self,
             vocab_file: Optional[str] = None,
             unk_token: Union[str, AddedToken] = "<unk>",
             sep_token: Union[str, AddedToken] = "</s>",
             cls_token: Union[str, AddedToken] = "<s>",
             nl_token: Union[str, AddedToken] = "<nl>",
             pad_token: Union[str, AddedToken] = "<pad>",
             mask_token: Union[str, AddedToken] = "<mask>",
             clean_text: bool = True,
             handle_chinese_chars: bool = True,
             separate_numbers: bool = True,
             strip_accents: bool = True,
             lowercase: bool = True,
             wordpieces_prefix: str = "##",
             special_chars: str = SPECIAL_CHARS,
             zh_norm: bool = True,
             handle_simpl: bool = True,
             do_postprocess: bool = False):
    """Build a WordPiece tokenizer with NFKC + BERT-style normalization.

    Args:
        vocab_file: Vocabulary handed to the ``WordPiece`` model. When
            ``None`` an empty ``WordPiece`` model is created.
        unk_token, sep_token, cls_token, nl_token, pad_token, mask_token:
            Special tokens. Each one is registered with the tokenizer only
            if it is already present in the vocabulary.
        clean_text, handle_chinese_chars, separate_numbers, strip_accents,
        lowercase, special_chars, zh_norm, handle_simpl:
            Forwarded unchanged to ``BertNormalizer``.
        wordpieces_prefix: Prefix given to the ``WordPiece`` decoder.
        do_postprocess: When true (and a vocabulary was supplied), install
            a ``BertProcessing`` post-processor built from ``sep_token``
            and ``cls_token``.

    Raises:
        TypeError: If post-processing is requested but ``sep_token`` or
            ``cls_token`` is missing from the vocabulary.
    """
    if vocab_file is not None:
        tokenizer = Tokenizer(
            WordPiece(vocab_file, unk_token=str(unk_token)))
    else:
        tokenizer = Tokenizer(WordPiece())

    # Let the tokenizer know about special tokens if they are part of the
    # vocab. Each token is checked and registered exactly once, in the same
    # order as before (the mask token used to be handled twice).
    for special in (unk_token, sep_token, cls_token, pad_token, nl_token,
                    mask_token):
        special_str = str(special)
        if tokenizer.token_to_id(special_str) is not None:
            tokenizer.add_special_tokens([special_str])

    tokenizer.normalizer = Sequence([
        NFKC(),
        BertNormalizer(clean_text=clean_text,
                       handle_chinese_chars=handle_chinese_chars,
                       separate_numbers=separate_numbers,
                       strip_accents=strip_accents,
                       lowercase=lowercase,
                       special_chars=special_chars,
                       zh_norm=zh_norm,
                       handle_simpl=handle_simpl),
    ])
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if vocab_file is not None and do_postprocess:
        # The post-processor needs concrete ids for both tokens.
        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")
        tokenizer.post_processor = BertProcessing(
            (str(sep_token), sep_token_id),
            (str(cls_token), cls_token_id))

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    # NOTE(review): ``do_postprocess`` is the only constructor argument not
    # recorded here; left as-is to keep the stored parameters unchanged.
    parameters = {
        "model": "BertWordPiece",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "nl_token": nl_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "separate_numbers": separate_numbers,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "special_chars": special_chars,
        "zh_norm": zh_norm,
        "handle_simpl": handle_simpl,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    sep_token: Union[str, AddedToken] = "<sep>",
    cls_token: Union[str, AddedToken] = "<cls>",
    pad_token: Union[str, AddedToken] = "<pad>",
    mask_token: Union[str, AddedToken] = "<mask>",
    lowercase: bool = False,
    unicode_normalizer: Optional[str] = None,
):
    """Build a whitespace-split WordLevel tokenizer.

    Args:
        vocab_file: Vocabulary handed to the ``WordLevel`` model. When
            ``None`` an empty ``WordLevel`` model is created and no
            post-processor is installed.
        unk_token: Unknown-word token; passed to the ``WordLevel`` model
            when a vocabulary is supplied.
        sep_token, cls_token, pad_token, mask_token: Special tokens. Like
            ``unk_token``, each is registered with the tokenizer only if
            it is already present in the vocabulary.
        lowercase: Append a ``Lowercase`` normalizer when true.
        unicode_normalizer: Name resolved through
            ``unicode_normalizer_from_str``; applied before lowercasing.

    Raises:
        TypeError: If a vocabulary is supplied but ``sep_token`` or
            ``cls_token`` is missing from it.
    """
    if vocab_file is not None:
        # Pass the configured unknown token to the model; otherwise the
        # model keeps its own default and ignores a custom ``unk_token``.
        tokenizer = Tokenizer(
            WordLevel(vocab_file, unk_token=str(unk_token)))
    else:
        tokenizer = Tokenizer(WordLevel())

    # Let the tokenizer know about special tokens if they are part of the
    # vocab.
    for special in (unk_token, sep_token, cls_token, pad_token, mask_token):
        special_str = str(special)
        if tokenizer.token_to_id(special_str) is not None:
            tokenizer.add_special_tokens([special_str])

    # Unicode normalization comes first, before everything else.
    normalizers = []
    if unicode_normalizer:
        normalizers.append(unicode_normalizer_from_str(unicode_normalizer))
    if lowercase:
        normalizers.append(Lowercase())

    # Only wrap in a Sequence when there is more than one step; leave the
    # normalizer untouched when there is none.
    if len(normalizers) > 1:
        tokenizer.normalizer = Sequence(normalizers)
    elif normalizers:
        tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    if vocab_file is not None:
        # The post-processor needs concrete ids for both tokens.
        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")
        tokenizer.post_processor = processors.BertProcessing(
            (str(sep_token), sep_token_id),
            (str(cls_token), cls_token_id))

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
        "unicode_normalizer": unicode_normalizer,
    }

    super().__init__(tokenizer, parameters)
class LitTokenizer:
    """Thin wrapper that configures, trains and applies a ``Tokenizer``.

    The underlying model, decoder and trainer come from the module-level
    ``tok_model``, ``tok_decoder`` and ``tok_trainer`` factories.
    """

    def __init__(self,
                 padding=False,
                 truncation=False,
                 max_length=None,
                 lower=False,
                 lang=None):
        """Declare the special tokens and build the configured tokenizer.

        ``padding``, ``truncation``, ``max_length`` and ``lower`` are
        forwarded to :meth:`configure_tokenizers`; ``lang`` is only stored.
        """
        super().__init__()

        # Special tokens; their order in ``special_tokens`` matters because
        # ``pad`` uses the list position of the pad token as the fill value.
        (self.UNK_WORD, self.PAD_WORD, self.MASK_WORD, self.SOS_WORD,
         self.EOS_WORD) = ('[UNK]', '[PAD]', '[MASK]', '[SOS]', '[EOS]')
        self.special_tokens = [
            self.UNK_WORD,
            self.PAD_WORD,
            self.MASK_WORD,
            self.SOS_WORD,
            self.EOS_WORD,
        ]

        # Placeholder until configure_tokenizers builds the real tokenizer.
        self.tokenizer = None
        self.configure_tokenizers(padding, truncation, max_length, lower)

        self.lang = lang

    def get_vocab_size(self):
        """Return the vocabulary size reported by the wrapped tokenizer."""
        return self.tokenizer.get_vocab_size()

    def configure_tokenizers(self, padding, truncation, max_length, lower):
        """(Re)build ``self.tokenizer`` with the requested options.

        Raises:
            ValueError: If ``padding`` is not one of ``True``, ``False``,
                ``"longest"``, ``"max_length"`` or ``"do_not_pad"``.
        """
        # Validate the padding mode up front; only "max_length" pads to a
        # fixed length, every other accepted mode leaves the length unset.
        if padding not in {True, "longest", "max_length", False,
                           "do_not_pad"}:
            raise ValueError("Unknown padding type")
        pad_length = max_length if padding == "max_length" else None

        # Normalization pipeline (StripAccents would require NFD).
        steps = [NFD(), Strip()]
        if lower:
            steps.append(Lowercase())

        # Passing unk_token=... to the model is reported as not working.
        tokenizer = self.tokenizer = Tokenizer(tok_model())
        tokenizer.add_special_tokens(self.special_tokens)
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()])
        tokenizer.normalizer = normalizers.Sequence(steps)
        tokenizer.decoder = tok_decoder()

        # Template that wraps every sequence in the SOS/EOS tokens.
        sos, eos = self.SOS_WORD, self.EOS_WORD
        tokenizer.post_processor = TemplateProcessing(
            single=f"{sos} $A {eos}",
            pair=f"{sos} $A {eos} {sos} $B {eos}",
            special_tokens=[
                (sos, tokenizer.token_to_id(sos)),
                (eos, tokenizer.token_to_id(eos)),
            ],
        )

        if padding:
            pad_word = self.PAD_WORD
            tokenizer.enable_padding(pad_id=tokenizer.token_to_id(pad_word),
                                     pad_token=pad_word,
                                     length=pad_length)
        if truncation:
            tokenizer.enable_truncation(max_length,
                                        stride=0,
                                        strategy='longest_first')

    def load_vocab(self, vocab, merges):
        """Replace the tokenizer's model with one read from the given files."""
        loaded_vocab, loaded_merges = tok_model.read_file(vocab, merges)
        self.tokenizer.model = tok_model(loaded_vocab, loaded_merges)

    def train_vocab(self, files, vocab_size=32000, min_frequency=3):
        """Train the tokenizer on ``files`` using a fresh ``tok_trainer``."""
        self.tokenizer.train(
            files,
            tok_trainer(vocab_size=vocab_size, min_frequency=min_frequency),
        )

    def save_vocab(self, output_dir, prefix):
        """Save the tokenizer's model under ``output_dir`` with ``prefix``."""
        self.tokenizer.model.save(output_dir, prefix)

    def pad(self, examples, keys=None):
        """Right-pad the tensors of ``examples`` to a common length per key.

        Args:
            examples: Sequence of mappings from key to 1-D tensor.
            keys: Keys to pad; defaults to the keys of the first example.
                Only ``"ids"`` and ``"attention_mask"`` are supported.

        Returns:
            Dict mapping each key to a list of padded tensors. ``"ids"``
            are filled with the pad token's position in
            ``self.special_tokens`` and ``"attention_mask"`` with ``0``.

        Raises:
            TypeError: If a key other than the two supported ones is used.
        """
        fill_values = {
            "ids": self.special_tokens.index(self.PAD_WORD),
            "attention_mask": 0,
        }
        selected = keys if keys else list(examples[0].keys())

        batch = {}
        for key in selected:
            # Gather the tensors for this key and find the length to reach.
            column = [example[key] for example in examples]
            target_len = max(tensor.shape[-1] for tensor in column)

            padded = []
            for tensor in column:
                if key not in fill_values:
                    raise TypeError("Unknown key")
                # Start from an all-padding buffer, then copy the real
                # values over its prefix.
                buf = torch.full((target_len, ),
                                 fill_value=fill_values[key],
                                 device=tensor.device)
                buf[:tensor.shape[-1]] = tensor
                padded.append(buf)
            batch[key] = padded
        return batch

    def encode(self, x):
        """Encode ``x`` with the wrapped tokenizer."""
        return self.tokenizer.encode(x)

    def decode(self, x):
        """Decode every row of ``x``; a tensor input must be 2-D."""
        rows = x
        if isinstance(rows, torch.Tensor):
            assert len(rows.shape) == 2
            rows = rows.detach().cpu().numpy()
        return [self.tokenizer.decode(row) for row in rows]
os.environ["CUDA_VISIBLE_DEVICES"] = "2" uid_task_id_sequence_path = 'data/feature_sequence/uid_task_id.txt' paths = [str(x) for x in Path(".").glob('data/feature_sequence/*.txt')] tokenizer = Tokenizer(WordLevel()) tokenizer.pre_tokenizer = Whitespace() # trainer = trainers.BpeTrainer( trainer = trainers.WordPieceTrainer( special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) tokenizer.train(trainer, [uid_task_id_sequence_path]) tokenizer.post_processor = TemplateProcessing( single="[CLS] $A [SEP]", pair="[CLS] $A [SEP] $B:1 [SEP]:1", special_tokens=[ ("[CLS]", tokenizer.token_to_id("[CLS]")), ("[SEP]", tokenizer.token_to_id("[SEP]")), ], ) # tokenizer.save_model("tmp") tokenizer.model.save('data/bert_and_tokenizer', 'uid_task_id') # tokenizer = ByteLevelBPETokenizer( # "./tmp/vocab.json", # "./tmp/merges.txt", # ) # task id的词汇表大小 task_id_vocab_size = 6033 config = BertConfig(