def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    tokenize_chinese_chars = False
    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:2 $A:0 {sep}:0",  # token_type_id is 2 for Funnel transformer
        pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1",
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def converted(self) -> Tokenizer:
    from .models.roformer.tokenization_utils import JiebaPreTokenizer

    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(JiebaPreTokenizer(vocab))

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:0 $A:0 {sep}:0",
        pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def train_new_from_iterator(self, text_iterator, s_vocab, new_special_tokens=None, special_tokens_map=None, **kw):
    tokenizer_json = json.loads(self._tokenizer.to_str())
    # Remove added tokens for now (uses IDs of tokens)
    added_tokens = tokenizer_json.pop("added_tokens")
    # Remove post processor for now (uses IDs of tokens)
    post_processor = tokenizer_json.pop("post_processor")

    unk = None
    # Remove vocab
    if tokenizer_json["model"]["type"] == "BPE":
        tokenizer_json["model"]["vocab"] = {}
        tokenizer_json["model"]["merges"] = []
    elif tokenizer_json["model"]["type"] == "Unigram":
        if tokenizer_json["model"]["unk_id"] is not None:
            unk_id = tokenizer_json["model"]["unk_id"]
            unk = tokenizer_json["model"]["vocab"][unk_id][0]
            if special_tokens_map is not None and unk in special_tokens_map:
                unk = special_tokens_map[unk]
            tokenizer_json["model"]["unk_id"] = 0
            tokenizer_json["model"]["vocab"] = [[unk, 0.0]]
    elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]:
        tokenizer_json["model"]["vocab"] = {}
    else:
        raise ValueError(
            f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
            "only BPE, Unigram, WordLevel and WordPiece."
        )

    if (
        special_tokens_map is not None
        and "unk" in tokenizer_json["model"]
        and tokenizer_json["model"]["unk"] in special_tokens_map
    ):
        tokenizer_json["model"]["unk"] = special_tokens_map[tokenizer_json["model"]["unk"]]

    tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

    # Get the special tokens from the current tokenizer if none are specified.
    special_tokens = []
    for added_token in added_tokens:
        special = added_token.pop("special", None)
        _ = added_token.pop("id", None)
        if tokenizer_json["model"]["type"] != "Unigram" and not special:
            continue
        if special_tokens_map is not None and added_token["content"] in special_tokens_map:
            added_token["content"] = special_tokens_map[added_token["content"]]
        special_tokens.append(AddedToken(**added_token))

    if new_special_tokens is not None:
        special_tokens.extend(new_special_tokens)

    # Trainer needs to know the end of word / continuing subword thingies in BPE
    if (
        tokenizer_json["model"]["type"] == "BPE"
        and "continuing_subword_prefix" not in kw
        and tokenizer_json["model"]["continuing_subword_prefix"] is not None
    ):
        kw["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"]
    if (
        tokenizer_json["model"]["type"] == "BPE"
        and "end_of_word_suffix" not in kw
        and tokenizer_json["model"]["end_of_word_suffix"] is not None
    ):
        kw["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
    if tokenizer_json["model"]["type"] == "Unigram" and unk is not None:
        kw["unk"] = unk

    trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
    trainer = trainer_class(s_vocab=s_vocab, special_tokens=special_tokens, **kw)
    tokenizer.train_from_iterator(text_iterator, trainer=trainer)

    if post_processor is not None:
        trained_tokenizer_json = json.loads(tokenizer.to_str())
        # Almost done, we just have to adjust the token IDs in the post processor
        if "special_tokens" in post_processor:
            for key in post_processor["special_tokens"]:
                tokens = post_processor["special_tokens"][key]["tokens"]
                if special_tokens_map is not None:
                    tokens = [special_tokens_map.get(token, token) for token in tokens]
                post_processor["special_tokens"][key]["tokens"] = tokens
                post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

        for special_token in ["cls", "sep"]:
            if special_token in post_processor:
                token, _ = post_processor[special_token]
                if special_tokens_map is not None and token in special_tokens_map:
                    token = special_tokens_map[token]
                token_id = tokenizer.token_to_id(token)
                post_processor[special_token] = [token, token_id]

        trained_tokenizer_json["post_processor"] = post_processor
        tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))

    kw = self.init_kw.copy()
    # Map pad/cls/mask token at the Transformers level
    special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
    special_tokens_list.remove("additional_special_tokens")
    for token in special_tokens_list:
        # Get the private one to avoid unnecessary warnings.
        if getattr(self, f"_{token}") is not None:
            special_token = getattr(self, token)
            if special_tokens_map is not None and special_token in special_tokens_map:
                special_token = special_tokens_map[special_token]

            special_token_full = getattr(self, f"_{token}")
            if isinstance(special_token_full, AddedToken):
                # Create an added token with the same parameters except the content
                kw[token] = AddedToken(
                    special_token,
                    single_word=special_token_full.single_word,
                    lstrip=special_token_full.lstrip,
                    rstrip=special_token_full.rstrip,
                    normalized=special_token_full.normalized,
                )
            else:
                kw[token] = special_token

    additional_special_tokens = self.additional_special_tokens
    if new_special_tokens is not None:
        additional_special_tokens.extend(new_special_tokens)
    if len(additional_special_tokens) > 0:
        kw["additional_special_tokens"] = additional_special_tokens

    return self.__class__(tokenizer_object=tokenizer, **kw)
def test_from_pretrained(self):
    tokenizer = Tokenizer.from_pretrained("bert-base-cased")
    output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
    assert output.tokens == ["Hey", "there", "dear", "friend", "!"]
def test_post_process(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.enable_truncation(2)
    tokenizer.enable_padding(length=4)

    encoding = tokenizer.encode("my name is john")
    pair_encoding = tokenizer.encode("pair")

    # Can post process a single encoding
    output = tokenizer.post_process(encoding)
    assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]

    # Can post process a pair of encodings
    output = tokenizer.post_process(encoding, pair_encoding)
    assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
def test_truncation(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.enable_truncation(2)

    # Can truncate single sequences
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name"]

    # Can truncate pair sequences as well
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "pair"]

    # Can get the params and give them to enable_truncation
    trunc = tokenizer.truncation
    tokenizer.enable_truncation(**trunc)

    # Left truncation direction
    tokenizer.enable_truncation(2, direction="left")
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["is", "john"]

    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["john", "pair"]
concepts = set([c.lower() for i in data for c in i['concept'].split("|")])
print(f"Unique concepts: {len(concepts)}\n")

# name = mention + concept
names = unique([mentions + list(concepts)], verbose=False) - stop_words
# names = unique([mentions + list(concepts)], verbose=False)
print(f"Unique names: {len(names)}\n")

name_words = {n: " ".join(split_to_words(n)) for n in names}
with open(f"{proc_path}/names.txt", "w") as f:
    f.write("\n".join(list(name_words.values())))
    # f.write("\n".join(words))

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    # NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True)
tokenizer.train(trainer, [f"{proc_path}/names.txt"])
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(proc_path)
tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json', f'{proc_path}/merges.txt')
def fetch_encoder(config: EncoderConfig):
    if config.is_pretrained:
        return GPT2TokenizerFast.from_pretrained(config.location)

    return Tokenizer.from_file(config.location)
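# Hedged usage sketch (not from the original project): EncoderConfig is assumed to expose an
# `is_pretrained` flag and a `location` that is either a Hugging Face model name or a path to a
# serialized tokenizers JSON file; the construction below is hypothetical.
cfg = EncoderConfig(is_pretrained=True, location="gpt2")
encoder = fetch_encoder(cfg)  # returns a GPT2TokenizerFast in this branch
print(encoder.encode("hello world"))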
def test_full_serialization_albert(self, albert_base):
    # Check we can read this file.
    # This used to fail because of BufReader that would fail because the
    # file exceeds the buffer capacity
    tokenizer = Tokenizer.from_file(albert_base)
for line in tqdm(fin):
    dp = json.loads(line.strip())
    for d in dp:
        if "value" in d:
            if "," in d["value"]:
                print('Not cleaned up')

# Extract value/types from trees and store in comma separated raw file (all_raw.json)
with open("output/all_new_trees.json") as fin, open("output/all_raw.json", "w") as fout:
    for i, line in enumerate(tqdm(fin)):
        dp = json.loads(line)
        token_list = []
        for d in dp:
            if "value" in d:
                token_list.append(d["value"])
            elif "type" in d:
                token_list.append(d["type"])
        raw = ",".join(token_list)
        print(json.dumps(raw), file=fout)

# Train tokenizer on raw file
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter=",")
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[PAD]"])
tokenizer.train(["output/all_raw.json"], trainer)
tokenizer.save("output/tokenizer.json")
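# Hedged usage sketch (not in the original script): reload the tokenizer trained above and encode
# one comma-separated node sequence, formatted like the training data; the sample string is made up.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("output/tokenizer.json")
encoding = tok.encode("Module,body,Assign,NameStore,x")
print(encoding.tokens)
print(encoding.ids)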
import tensorflow as tf

import gpu_check
from preprocessing_data import create_training_data
from tokenizers import Tokenizer
from model_chatbot import seq2seq
from Hyper_parameter import (VOCAB_SIZE, MAXLEN, EPOCHS, SAVE_AT, LEARNING_RATE,
                             BATCH_SIZE, VERBOSE, LOSS)

tokenizer = Tokenizer()

encoder_input_data, decoder_input_data, decoder_output_data = create_training_data()  # parsing the dataset and creating conversation pairs
encoder_input_data, decoder_input_data, decoder_output_data = tokenizer.tokenize_and_pad_training_data(
    encoder_input_data, decoder_input_data, decoder_output_data)  # tokenizing and padding those pairs
tokenizer.save_tokenizer(f'tokenizer-vocab_size-{VOCAB_SIZE}')  # saving tokenizer for later use

Seq2SeqModel = seq2seq()  # creating the seq2seq model

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, clipnorm=1.0, clipvalue=0.5)
Seq2SeqModel.compile(optimizer=optimizer, loss=LOSS, metrics=['accuracy'])
Seq2SeqModel.summary()


def train(model, encoder_input_data, decoder_input_data, decoder_output_data, epochs, batch_size, verbose, save_at):
def main():
    parser = ArgumentParser()
    parser.add_argument('lang', choices=['nld', 'ita'])
    parser.add_argument('models', nargs='+')
    parser.add_argument('--src', default='small', choices=['full', 'small'])
    parser.add_argument('--file', default='full')
    parser.add_argument('-n', default=5, type=int)
    parser.add_argument('-f', '--force', action='store_true')
    args = parser.parse_args()

    base_path = Path('data') / args.lang / 'evaluation' / 'examples' / args.src / args.file
    src_path = base_path / 'gold.txt'
    if not src_path.exists():
        print(f' > gold path {src_path} does not exist')
        exit(1)

    print(' > loading tokenizer')
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    if args.lang == 'ita':
        tokenizer = GPT2TokenizerFast.from_pretrained('LorenzoDeMattei/GePpeTto')
    else:
        tokenizer_path = Path('data') / args.lang / 'vocabularies' / 'tokenizer.json'
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
        args.n += 1

    print(f' > loading examples from {src_path}')
    examples = []
    with open(src_path) as f:
        for line in f:
            token_ids = tokenizer.encode(line.strip())
            if type(token_ids) != list:
                token_ids = [0] + token_ids.ids
            examples.append(token_ids[:args.n])
    print(f' > loaded {len(examples)} examples')

    for model_name in args.models:
        tgt_path = base_path / f'{model_name.replace("/", "_")}.txt'
        if not args.force and tgt_path.exists():
            print(f'{tgt_path} already exists. skipping')
            continue

        model_path = Path('data') / args.lang / 'models' / model_name
        if not model_path.exists():
            model_path = model_name

        print(f' > loading model {model_path}')
        model = GPT2LMHeadModel.from_pretrained(model_path).cuda()
        model.eval()

        print(' > generating endings for examples')
        generated = [
            generate(input_ids, model, tokenizer)
            for input_ids in tqdm(examples, ncols=80)
        ]

        with open(tgt_path, 'w') as f:
            f.writelines(generated)
        print(f'\nsaved to {tgt_path}')
import os

import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = BpeTrainer(
from tokenizers import Tokenizer
import sys
import pickle
import numpy as np
from build_bpe import cleanup
import os

tokenizer = Tokenizer.from_file("bpe-fi.tokenizer.json")
print(tokenizer)

#dfolder = "../../Data/wiki/fi/"
dfolder = "../../Data/finovels/"
files = os.listdir(dfolder)

print("Read files from", dfolder)
print("...")

#s = open(dpath).read().lower()
lines = []
for dpath in files:
    with open(dfolder + dpath) as f:
        print("File:", dpath)
        for line in f:
            clean_line = cleanup(line)
            lines.append(clean_line)

#print("Encode", s[:100], len(s))
print("ENCODE")
encoded_l = tokenizer.encode_batch(lines)
import tensorflow as tf
import numpy as np
from tokenizers import ByteLevelBPETokenizer as Tokenizer
from transformers import RobertaConfig as Config
import re

PATH = 'roberta-base'
MAX_SEQUENCE_LENGTH = 192
TOKENIZER = Tokenizer(vocab_file="roberta/vocab.json",
                      merges_file="roberta/merges.txt",
                      lowercase=True,
                      add_prefix_space=True)


def preprocess(tweet, selected_text, sentiment, training=True):
    """
    Will be used in tf.data.Dataset.from_generator(...)
    """
    # The original strings have been converted to
    # byte strings, so we need to decode it
    tweet = tweet.decode('utf-8')
    selected_text = selected_text.decode('utf-8')
    sentiment = sentiment.decode('utf-8')

    # Clean up the strings a bit
    tweet = " ".join(str(tweet).split())
    selected_text = " ".join(str(selected_text).split())
def train_new_from_iterator(
    self,
    text_iterator,
    vocab_size,
    length=None,
    new_special_tokens=None,
    special_tokens_map=None,
    **kwargs,
):
    """
    Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization
    pipeline) as the current one.

    Args:
        text_iterator (generator of `List[str]`):
            The training corpus. Should be a generator of batches of texts, for instance a list of lists of
            texts if you have everything in memory.
        vocab_size (`int`):
            The size of the vocabulary you want for your tokenizer.
        length (`int`, *optional*):
            The total number of sequences in the iterator. This is used to provide meaningful progress tracking.
        new_special_tokens (list of `str` or `AddedToken`, *optional*):
            A list of new special tokens to add to the tokenizer you are training.
        special_tokens_map (`Dict[str, str]`, *optional*):
            If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
            token name to new special token name in this argument.
        kwargs:
            Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

    Returns:
        [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
        `text_iterator`.
    """
    tokenizer_json = json.loads(self._tokenizer.to_str())
    # Remove added tokens for now (uses IDs of tokens)
    added_tokens = tokenizer_json.pop("added_tokens")
    # Remove post processor for now (uses IDs of tokens)
    post_processor = tokenizer_json.pop("post_processor")

    unk_token = None
    # Remove vocab
    if tokenizer_json["model"]["type"] == "BPE":
        tokenizer_json["model"]["vocab"] = {}
        tokenizer_json["model"]["merges"] = []
    elif tokenizer_json["model"]["type"] == "Unigram":
        if tokenizer_json["model"]["unk_id"] is not None:
            unk_id = tokenizer_json["model"]["unk_id"]
            unk_token = tokenizer_json["model"]["vocab"][unk_id][0]
            if special_tokens_map is not None and unk_token in special_tokens_map:
                unk_token = special_tokens_map[unk_token]
            tokenizer_json["model"]["unk_id"] = 0
            tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]]
    elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]:
        tokenizer_json["model"]["vocab"] = {}
    else:
        raise ValueError(
            f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
            "only BPE, Unigram, WordLevel and WordPiece."
        )

    if (
        special_tokens_map is not None
        and "unk_token" in tokenizer_json["model"]
        and tokenizer_json["model"]["unk_token"] in special_tokens_map
    ):
        tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]]

    tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

    # Get the special tokens from the current tokenizer if none are specified.
    special_tokens = []
    for added_token in added_tokens:
        special = added_token.pop("special", None)
        _ = added_token.pop("id", None)
        if tokenizer_json["model"]["type"] != "Unigram" and not special:
            continue
        if special_tokens_map is not None and added_token["content"] in special_tokens_map:
            added_token["content"] = special_tokens_map[added_token["content"]]
        special_tokens.append(AddedToken(**added_token))

    if new_special_tokens is not None:
        special_tokens.extend(new_special_tokens)

    # Trainer needs to know the end of word / continuing subword thingies in BPE
    if (
        tokenizer_json["model"]["type"] == "BPE"
        and "continuing_subword_prefix" not in kwargs
        and tokenizer_json["model"]["continuing_subword_prefix"] is not None
    ):
        kwargs["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"]
    if (
        tokenizer_json["model"]["type"] == "BPE"
        and "end_of_word_suffix" not in kwargs
        and tokenizer_json["model"]["end_of_word_suffix"] is not None
    ):
        kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
    if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
        kwargs["unk_token"] = unk_token

    trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
    trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
    tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)

    if post_processor is not None:
        trained_tokenizer_json = json.loads(tokenizer.to_str())
        # Almost done, we just have to adjust the token IDs in the post processor
        if "special_tokens" in post_processor:
            for key in post_processor["special_tokens"]:
                tokens = post_processor["special_tokens"][key]["tokens"]
                if special_tokens_map is not None:
                    tokens = [special_tokens_map.get(token, token) for token in tokens]
                post_processor["special_tokens"][key]["tokens"] = tokens
                post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

        for special_token in ["cls", "sep"]:
            if special_token in post_processor:
                token, _ = post_processor[special_token]
                if special_tokens_map is not None and token in special_tokens_map:
                    token = special_tokens_map[token]
                token_id = tokenizer.token_to_id(token)
                post_processor[special_token] = [token, token_id]

        trained_tokenizer_json["post_processor"] = post_processor
        tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))

    kwargs = self.init_kwargs.copy()
    # Map pad/cls/mask token at the Transformers level
    special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
    special_tokens_list.remove("additional_special_tokens")
    for token in special_tokens_list:
        # Get the private one to avoid unnecessary warnings.
        if getattr(self, f"_{token}") is not None:
            special_token = getattr(self, token)
            if special_tokens_map is not None and special_token in special_tokens_map:
                special_token = special_tokens_map[special_token]

            special_token_full = getattr(self, f"_{token}")
            if isinstance(special_token_full, AddedToken):
                # Create an added token with the same parameters except the content
                kwargs[token] = AddedToken(
                    special_token,
                    single_word=special_token_full.single_word,
                    lstrip=special_token_full.lstrip,
                    rstrip=special_token_full.rstrip,
                    normalized=special_token_full.normalized,
                )
            else:
                kwargs[token] = special_token

    additional_special_tokens = self.additional_special_tokens
    if new_special_tokens is not None:
        additional_special_tokens.extend(new_special_tokens)
    if len(additional_special_tokens) > 0:
        kwargs["additional_special_tokens"] = additional_special_tokens

    return self.__class__(tokenizer_object=tokenizer, **kwargs)
def preprocess_data(args):
    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_, mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab, lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:
            file_name = generate_out_filename(input_file, args)
            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(json.dumps((examples_per_file[input_file], len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file], args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f), max_lookahead=batch_size) as g:
                    for example, labels in g:
                        example_batch.append(example)
                        labels_batch.append(labels)
                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(example_batch)
                            labels_batch = mlb.transform(labels_batch)
                            for example, labels in zip(example_batch, labels_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                out_f.write(json.dumps([example.ids, labels]) + '\n')
                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)
                    for example, labels in zip(example_batch, labels_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        out_f.write(json.dumps([example.ids, labels]) + '\n')
        return torch.tensor(self.examples[i])


configuration = BertConfig()
model = BertModel(configuration)
configuration = model.config

#tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
#trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
#tokenizer.pre_tokenizer = Whitespace()
#files = ['./processed_wiki_ko.txt']
#tokenizer.train(files=files, trainer=trainer)

#tokenizer = Tokenizer.from_file("./wiki_tokenizer.json")
#fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="wiki_tokenizer.json")
tokenizer = Tokenizer.from_file("./wiki_tokenizer.json")
tokenizer.enable_truncation(max_length=512)

#tokenizer._tokenizer.post_processor = BertProcessing(
#    single="[CLS] $A [SEP]",
#    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
#    special_tokens=[
#        ("[CLS]", tokenizer.token_to_id("[CLS]")),
#        ("[SEP]", tokenizer.token_to_id("[SEP]")),
#    ],
#)
tokenizer.post_processor = BertProcessing(
    sep=("[SEP]", tokenizer.token_to_id("[SEP]")),
    cls=("[CLS]", tokenizer.token_to_id("[CLS]")))
"CHEF_CHECK": 6, "CHEF_DO": 7, "MOVE_CONTENTS": 8, } k = len(output_vocab) with open("../data/res2idx.json", 'r') as f: for w, i in json.load(f).items(): output_vocab[w] = k k += 1 with open("../data/arg2idx.json", 'r') as f: for w, i in json.load(f).items(): output_vocab[w.replace('-', '_')] = k k += 1 output_vocab = {w: i for i, w in enumerate(output_vocab)} output_tokenizer = Tokenizer(WordLevel(output_vocab, )) output_tokenizer.pre_tokenizer = Whitespace() t = output_tokenizer.encode_batch( ["SERVE MOVE_CONTENTS", "SERVE MOVE_CONTENTS PUT"]) # print (t) csv_file = '../data/seq2seq_4335716.csv' input_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') input_tokenizer.bos_token = input_tokenizer.cls_token input_tokenizer.eos_token = input_tokenizer.sep_token val_data = load_dataset('csv', data_files=csv_file, split='train[90%:]') train_data = load_dataset('csv', data_files=csv_file, split='train[:90%]') # print(val_data) # print(train_data)
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")
        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)
    tokenizer.pad_token_id = vocab_size

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"], max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"], max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # Pad
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)
    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
import json
from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import CharDelimiterSplit
from tokenizers.processors import BertProcessing
from transformers import PreTrainedTokenizerFast

# TextTokenizer is a project-local helper; its import path is not shown in the original snippet.

data_path = Path('/workspace/poetry2021.gt/data/pan_tadeusz5')
dataset_path = data_path / 'dataset'
vocab_path = data_path / 'vocab.json'
tokenizer_tmp_path = data_path / 'tokenizer_tmp'
tokenizer_path = data_path / 'tokenizer'

text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(vocab_path)

vocab = text_tokenizer.vocab
vocab_count = len(vocab.keys())
vocab.update({'<|endoftext|>': vocab_count})

tokenizer_tmp = Tokenizer(WordLevel(text_tokenizer.vocab))
tokenizer_tmp.pre_tokenizer = CharDelimiterSplit(' ')
tokenizer_tmp.post_processor = BertProcessing(
    ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>")),
    ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>")),
)

tokenizer_tmp_path.mkdir(parents=True, exist_ok=True)
tokenizer_tmp.save(str(tokenizer_tmp_path / "tokenizer.json"))


# Re-create as GPT2 compatible tokenizer
class GPT2CompatibleTokenizer(PreTrainedTokenizerFast):
    def save_vocabulary(self,
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
#from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
#tokenizer.pre_tokenizer = Whitespace()

files = ['./processed/processed_wiki_ko.txt']
tokenizer.train(files, trainer)
tokenizer.save("wiki_tokenizer.json")
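# Hedged usage sketch (not part of the original training script): reload the serialized tokenizer
# and encode a sample sentence to sanity-check the learned vocabulary; the sample text is illustrative only.
from tokenizers import Tokenizer

reloaded = Tokenizer.from_file("wiki_tokenizer.json")
output = reloaded.encode("위키백과 문서로 학습한 토크나이저 테스트")
print(output.tokens)
print(output.ids)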
def test_padding(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # By default it does nothing when encoding single sequence
    tokenizer.enable_padding()
    output = tokenizer.encode("my name")
    assert output.tokens == ["my", "name"]

    # Can pad to the longest in a batch
    output = tokenizer.encode_batch(["my name", "my name is john"])
    assert all([len(encoding) == 4 for encoding in output])

    # Can pad to the specified length otherwise
    tokenizer.enable_padding(length=4)
    output = tokenizer.encode("my name")
    assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["my", "name", "pair", "[PAD]"]

    # Can get the params and give them to enable_padding
    padding = tokenizer.padding
    tokenizer.enable_padding(**padding)
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    unk_token: Union[str, AddedToken] = "[UNK]",
    sep_token: Union[str, AddedToken] = "[SEP]",
    cls_token: Union[str, AddedToken] = "[CLS]",
    pad_token: Union[str, AddedToken] = "[PAD]",
    mask_token: Union[str, AddedToken] = "[MASK]",
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    strip_accents: Optional[bool] = None,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
):
    if vocab is not None:
        tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
    else:
        tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

    # Let the tokenizer know about special tokens if they are part of the vocab
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    tokenizer.normalizer = BertNormalizer(
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        lowercase=lowercase,
    )
    tokenizer.pre_tokenizer = BertPreTokenizer()

    if vocab is not None:
        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = BertProcessing(
            (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

    tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

    parameters = {
        "model": "BertWordPiece",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "clean_text": clean_text,
        "handle_chinese_chars": handle_chinese_chars,
        "strip_accents": strip_accents,
        "lowercase": lowercase,
        "wordpieces_prefix": wordpieces_prefix,
    }

    super().__init__(tokenizer, parameters)
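# Hedged usage sketch, assuming the __init__ above belongs to tokenizers' BertWordPieceTokenizer
# (the "model": "BertWordPiece" entry in the parameters dict suggests it). The tiny in-memory vocab
# is made up for illustration; a real setup would pass a vocab.txt path or a full vocabulary dict.
vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "[PAD]": 3, "[MASK]": 4, "hello": 5, "world": 6}
bert_tokenizer = BertWordPieceTokenizer(vocab, lowercase=True)
print(bert_tokenizer.encode("hello world").tokens)  # ['[CLS]', 'hello', 'world', '[SEP]']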
def test_multiprocessing_with_parallelism(self):
    tokenizer = Tokenizer(BPE())
    multiprocessing_with_parallelism(tokenizer, False)
    multiprocessing_with_parallelism(tokenizer, True)
def train_tokenizer(args):
    """[summary]

    Arguments:
        args {[dictionary]} -- [arguments object]
    """
    # Tokenizer train
    morpheme_func = None
    if args.tokenizer.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif args.tokenizer.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # tokenizer-type", type=str, choices=["bbpe", "cbpe", "wp"], default="bbpe"
    if args.tokenizer.tokenizer_type == "bbpe":
        # tokenizer = BytelevelBPETokenizer()
        tokenizer = Tokenizer(BPE())
        # tokenizer.pre_tokenizer = BertPreTokenizer()
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        tokenizer.pre_tokenizer = CharDelimiterSplit
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        # tokenizer.pre_tokenizer = Whitespace
        trainer = WordPieceTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )

    tokenizer.train_from_iterator(get_pretokenize_generator(morpheme_func))
    tokenizer.save(f"../vocab/{args.tokenizer.tokenizer_type}.vocab")

    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids   :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    datasets = get_datasets(args.tokenizer.data_path)
    for line in datasets:
        print(line)
        break
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
""".split("\n")

if args.type == "gpt2":
    print("Running GPT-2 tokenizer")
    tok_p = GPT2Tokenizer.from_pretrained('gpt2')

    # Create a Tokenizer using BPE
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # Use ByteLevel PreTokenizer
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    # Use ByteLevel Decoder
    tok_r.decoder = decoders.ByteLevel()
elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100))
    tok_r.normalizer = BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

path_data = "../../ml-datasets/wmt14/tokenizer/"
path_train_src = "../../ml-datasets/wmt14/train.en"
path_train_tgt = "../../ml-datasets/wmt14/train.de"

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=25000,
                     show_progress=True,
                     initial_alphabet=ByteLevel.alphabet(),
                     min_frequency=2,
                     special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>"])
tokenizer.train(trainer, [path_train_src, path_train_tgt])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(path_data)
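# Hedged follow-up sketch (assumption, mirroring the save call above): the trained BPE model can be
# loaded back from the vocab/merges files written into path_data and attached to a fresh Tokenizer,
# re-applying the same normalizer/pre-tokenizer/decoder so encodings match the training setup.
reloaded = Tokenizer(BPE.from_file(path_data + "vocab.json", path_data + "merges.txt"))
reloaded.normalizer = Sequence([NFKC(), Lowercase()])
reloaded.pre_tokenizer = ByteLevel()
reloaded.decoder = ByteLevelDecoder()
print(reloaded.encode("a quick test sentence").tokens)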
class CustomNormalizer:
    def normalize(self, normalized: NormalizedString):
        # Most of these can be replaced by a `Sequence` combining some provided Normalizer,
        # (i.e. Sequence([NFKC(), Replace(Regex("\s+"), " "), Lowercase()]))
        # and that should be the preferred way. That being said, here is an example of the kind
        # of things that can be done here:
        normalized.nfkc()
        normalized.filter(lambda char: not char.isnumeric())
        normalized.replace(Regex("\s+"), " ")
        normalized.lowercase()


# This section shows how to attach these custom components to the Tokenizer
tok = Tokenizer(BPE())
tok.normalizer = Normalizer.custom(CustomNormalizer())
tok.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
tok.decoder = Decoder.custom(CustomDecoder())

input = "永和服装饰品有限公司"
print("PreTokenize:", input)
print(tok.pre_tokenizer.pre_tokenize_str(input))
# [('永和', (0, 2)), ('服装', (2, 4)), ('饰品', (4, 6)), ('有限公司', (6, 10))]

input = "112233"
print("PreTokenize:", input)
print(tok.pre_tokenizer.pre_tokenize_str(input))
# [('1', (0, 1)), ('122', (1, 4)), ('3', (4, 5)), ('3', (5, 6))]

input = "1234 ℌ𝔢𝔩𝔩𝔬 𝔱𝔥𝔢𝔯𝔢 𝓂𝓎 𝒹ℯ𝒶𝓇 𝕕𝕖𝕒𝕣 𝕗𝕣𝕚𝕖𝕟𝕕!"
type=str, help="Path to the output directory, where the files will be saved") parser.add_argument("--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files") args = parser.parse_args() files = glob.glob(args.files) if not files: print(f"File does not exist: {args.files}") exit(1) # Initialize an empty tokenizer tokenizer = Tokenizer(models.BPE.empty()) # Customize pre-tokenization and decoding tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False) tokenizer.decoder = decoders.ByteLevel.new() # And then train trainer = trainers.BpeTrainer.new( vocab_size=50000, min_frequency=2, show_progress=True, special_tokens=[ "<s>", "<pad>", "</s>" ], initial_alphabet=pre_tokenizers.ByteLevel.alphabet() ) tokenizer.train(trainer, files)