def train_tokenizer(langs, dataset, vocab_size):
    """Train a byte-level BPE tokenizer shared by a list of languages.

    A special token ``[LANG]`` is reserved for every language tag in
    ``langs``. These follow the five base special tokens in the trainer's
    list, so they are assigned to tokens 5, 6, ..., len(langs) + 4.
    """
    base_specials = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]']
    language_specials = ['[' + tag + ']' for tag in langs]

    # Model plus the normalise / pre-tokenise / decode pipeline.
    bpe_tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
    bpe_tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    bpe_tokenizer.decoder = decoders.ByteLevel()

    # Train on the multilingual iterator.
    bpe_trainer = BpeTrainer(
        special_tokens=base_specials + language_specials,
        vocab_size=vocab_size,
    )
    bpe_tokenizer.train_from_iterator(
        _MultilingualIterator(dataset, langs), bpe_trainer)

    # The start/end template needs the ids the trainer actually assigned.
    cls_id = bpe_tokenizer.token_to_id("[CLS]")
    sep_id = bpe_tokenizer.token_to_id("[SEP]")
    bpe_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
    )
    return bpe_tokenizer
def train_tokenizer(lang, dataset, vocab_size):
    """Train a whitespace-pre-tokenized BPE tokenizer on ``dataset[lang]``.

    Returns the trained tokenizer with a ``[CLS] ... [SEP]`` post-processor
    attached.
    """
    bpe_tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
    bpe_tokenizer.pre_tokenizer = Whitespace()

    bpe_trainer = BpeTrainer(
        special_tokens=['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'],
        vocab_size=vocab_size,
    )
    bpe_tokenizer.train_from_iterator(dataset[lang], bpe_trainer)

    # Look the ids up after training so the template matches the vocabulary.
    template_specials = [
        (marker, bpe_tokenizer.token_to_id(marker))
        for marker in ("[CLS]", "[SEP]")
    ]
    bpe_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=template_specials,
    )
    return bpe_tokenizer
def train_tokenizer(args):
    """Train the tokenizer described by ``args.tokenizer`` and save it.

    Arguments:
        args {dictionary} -- arguments object. The fields read here are
            ``args.tokenizer.pretokenizer_type``, ``tokenizer_type``
            ("bbpe", "cbpe" or "wp"), ``special_tokens``, ``vocab_size``,
            ``min_frequency`` and ``data_path``.

    Raises:
        ValueError -- if ``args.tokenizer.tokenizer_type`` is not one of
            the supported choices.
    """
    cfg = args.tokenizer

    # Optional morpheme analyser handed to the pre-tokenize generator.
    morpheme_func = None
    if cfg.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif cfg.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # All three trainers take the same configuration.
    trainer_kwargs = dict(
        special_tokens=omegalist_to_list(cfg.special_tokens),
        vocab_size=cfg.vocab_size,
        min_frequency=cfg.min_frequency,
    )

    if cfg.tokenizer_type == "bbpe":
        tokenizer = Tokenizer(BPE())
        trainer = BpeTrainer(**trainer_kwargs)
    elif cfg.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        # The pre-tokenizer must be an instance, not the class itself.
        # NOTE(review): a single space is assumed as the delimiter -- confirm
        # against what get_pretokenize_generator emits.
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")
        trainer = BpeTrainer(**trainer_kwargs)
    elif cfg.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        trainer = WordPieceTrainer(**trainer_kwargs)
    else:
        raise ValueError(
            f"unsupported tokenizer_type: {cfg.tokenizer_type!r} "
            "(expected 'bbpe', 'cbpe' or 'wp')")

    # Pass the trainer explicitly; without it the configured special tokens,
    # vocab size and min frequency are ignored.
    tokenizer.train_from_iterator(
        get_pretokenize_generator(morpheme_func), trainer=trainer)
    tokenizer.save(f"../vocab/{cfg.tokenizer_type}.vocab")

    # Smoke test: encode and decode a sample sentence and show the result.
    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    # Print the first line of the configured dataset.
    datasets = get_datasets(cfg.data_path)
    for line in datasets:
        print(line)
        break
def train_tokenizer(input_dir: str,
                    save_path: str,
                    tokenizer_type: str = "BPE",
                    vocab_size: int = 52000):
    """
    Train a tokenizer over the data yielded by ``json_iterator(input_dir)``
    and write it to `save_path`.

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train; only "BPE" is handled
    :param vocab_size: int, size of tokenizer's vocab
    :return: None
    :raises NotImplementedError: for any `tokenizer_type` other than "BPE"
    """
    # Guard clause: reject unsupported model types up front.
    if tokenizer_type != "BPE":
        raise NotImplementedError(
            f'Tokenizer type {tokenizer_type} not implemented')

    tokenizer = Tokenizer(models.BPE())

    # Byte-level pipeline with NFKC normalization.
    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.decoder = decoders.ByteLevel()

    bpe_trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<|endoftext|>", "<|padding|>"],
    )
    tokenizer.train_from_iterator(json_iterator(input_dir), bpe_trainer)

    tokenizer.save(save_path, pretty=True)
    print(f'Tokenizer saved at {save_path}')
def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided.

    :param tokenizer_location: Json containing information about the tokenizer.
    :return: a tokenizer with padding enabled. When trained here it is also
        saved to ``./daily_dialog/tokenizer.json``.
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location)
        tokenizer.enable_padding()
        return tokenizer

    import os

    dataset_train = datasets.load_dataset("daily_dialog", split="train")
    # One training string per dialogue: its utterances joined by the sep token.
    separator = special_tokens["sep_token"]
    utterances = [
        separator.join(dialogue["dialog"]) for dialogue in dataset_train
    ]

    trainer = WordPieceTrainer(
        vocab_size=2048,
        # The trainer expects a real list, not a dict_values view.
        special_tokens=list(token_utils.special_tokens.values()),
    )

    custom_tokenizer = Tokenizer(
        WordPiece(unk_token=special_tokens["unk_token"]))
    custom_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    custom_tokenizer.pre_tokenizer = Whitespace()
    custom_tokenizer.train_from_iterator(utterances, trainer)
    custom_tokenizer.enable_padding()

    # Persist the trained tokenizer, creating the target directory if needed.
    location = './daily_dialog/'
    os.makedirs(location, exist_ok=True)
    custom_tokenizer.save(location + "tokenizer.json")

    return custom_tokenizer
def tokenize(dt, df):
    """Train a WordPiece tokenizer on ``df.query_text`` and write artefacts.

    Three files are written under ``data_dir/<data source>``:
    ``tokenizer.json`` (the trained tokenizer), ``corpus.txt`` (query texts
    joined per session half) and ``vocab.txt`` (one vocabulary entry per
    line, taken from the saved tokenizer JSON).

    ``df`` is modified in place: the helper columns ``range_idx``,
    ``mean_rank_group`` and ``separate_column`` are added to it.

    Returns 0 without doing any work when both ``vocab.txt`` and
    ``corpus.txt`` already exist; otherwise returns None.
    """
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer

    data_source = get_data_source(dt)
    token_file = Path(data_dir, data_source, 'tokenizer.json')
    vocab_file = Path(data_dir, data_source, 'vocab.txt')
    corpus_file = Path(data_dir, data_source, 'corpus.txt')

    if vocab_file.is_file() and corpus_file.is_file():
        print("corpus and token files already generated")
        return 0

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    # Ids 1 and 2 match the positions of [CLS] and [SEP] in the trainer's
    # special_tokens list below.
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=25000,
        min_frequency=3,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train_from_iterator(df.query_text.to_list(), trainer)
    bert_tokenizer.save(str(token_file))

    # Split every session into two halves by row position and join the query
    # texts of each half into a single corpus line.
    df['range_idx'] = range(0, df.shape[0])
    df['mean_rank_group'] = df.groupby(
        ['session_id'], sort=False)['range_idx'].transform('mean')
    df['separate_column'] = df['range_idx'] < df['mean_rank_group']
    df = df.groupby(['session_id', 'separate_column'],
                    as_index=False,
                    sort=False)['query_text'].agg(
                        ' '.join).drop(columns='separate_column')
    df.query_text.to_csv(corpus_file, header=False, index=False)

    # Read and write with an explicit UTF-8 encoding so non-ASCII tokens do
    # not depend on the platform default.
    with open(token_file, encoding="utf-8") as token_f:
        jdata = json.load(token_f)
    with open(vocab_file, "w", encoding="utf-8") as fd:
        for k in jdata['model']['vocab']:
            print(k, file=fd)
def main(args):
    """Train a word-level tokenizer over an IPA phone inventory and save it.

    The training data is the inventory joined by spaces in its listed order
    plus three shuffled copies. The tokenizer is written to
    ``args.outdir + '/ipa_tokenizer.json'``.
    """
    # copy from https://github.com/xinjli/allosaurus
    ipa0 = [
        'I', 'a', 'aː', 'ã', 'ă', 'b', 'bʲ', 'bʲj', 'bʷ', 'bʼ', 'bː', 'b̞',
        'b̤', 'b̥', 'c', 'd', 'dʒ', 'dʲ', 'dː', 'd̚', 'd̥', 'd̪', 'd̯', 'd͡z',
        'd͡ʑ', 'd͡ʒ', 'd͡ʒː', 'd͡ʒ̤', 'e', 'eː', 'e̞', 'f', 'fʲ', 'fʷ', 'fː',
        'g', 'gʲ', 'gʲj', 'gʷ', 'gː', 'h', 'hʷ', 'i', 'ij', 'iː', 'i̞', 'i̥',
        'i̯', 'j', 'k', 'kx', 'kʰ', 'kʲ', 'kʲj', 'kʷ', 'kʷʼ', 'kʼ', 'kː',
        'k̟ʲ', 'k̟̚', 'k͡p̚', 'l', 'lʲ', 'lː', 'l̪', 'm', 'mʲ', 'mʲj', 'mʷ',
        'mː', 'n', 'nj', 'nʲ', 'nː', 'n̪', 'n̺', 'o', 'oː', 'o̞', 'o̥', 'p',
        'pf', 'pʰ', 'pʲ', 'pʲj', 'pʷ', 'pʷʼ', 'pʼ', 'pː', 'p̚', 'q', 'r',
        'rː', 's', 'sʲ', 'sʼ', 'sː', 's̪', 't', 'ts', 'tsʰ', 'tɕ', 'tɕʰ',
        'tʂ', 'tʂʰ', 'tʃ', 'tʰ', 'tʲ', 'tʷʼ', 'tʼ', 'tː', 't̚', 't̪', 't̪ʰ',
        't̪̚', 't͡s', 't͡sʼ', 't͡ɕ', 't͡ɬ', 't͡ʃ', 't͡ʃʲ', 't͡ʃʼ', 't͡ʃː', 'u',
        'uə', 'uː', 'u͡w', 'v', 'vʲ', 'vʷ', 'vː', 'v̞', 'v̞ʲ', 'w', 'x',
        'x̟ʲ', 'y', 'z', 'zj', 'zʲ', 'z̪', 'ä', 'æ', 'ç', 'çj', 'ð', 'ø',
        'ŋ', 'ŋ̟', 'ŋ͡m', 'œ', 'œ̃', 'ɐ', 'ɐ̞', 'ɑ', 'ɑ̱', 'ɒ', 'ɓ', 'ɔ',
        'ɔ̃', 'ɕ', 'ɕː', 'ɖ̤', 'ɗ', 'ə', 'ɛ', 'ɛ̃', 'ɟ', 'ɡ', 'ɡʲ', 'ɡ̤',
        'ɡ̥', 'ɣ', 'ɣj', 'ɤ', 'ɤɐ̞', 'ɤ̆', 'ɥ', 'ɦ', 'ɨ', 'ɪ', 'ɫ', 'ɯ',
        'ɯ̟', 'ɯ̥', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ', 'ɵ', 'ɸ', 'ɹ', 'ɹ̩', 'ɻ', 'ɻ̩',
        'ɽ', 'ɾ', 'ɾj', 'ɾʲ', 'ɾ̠', 'ʀ', 'ʁ', 'ʁ̝', 'ʂ', 'ʃ', 'ʃʲː', 'ʃ͡ɣ',
        'ʈ', 'ʉ̞', 'ʊ', 'ʋ', 'ʋʲ', 'ʌ', 'ʎ', 'ʏ', 'ʐ', 'ʑ', 'ʒ', 'ʒ͡ɣ', 'ʔ',
        'ʝ', 'ː', 'β', 'β̞', 'θ', 'χ', 'ә', 'ḁ'
    ]

    # Three shuffled copies, shuffled one after another so the random stream
    # is consumed in a fixed order.
    orderings = [ipa0]
    for _ in range(3):
        permuted = ipa0.copy()
        random.shuffle(permuted)
        orderings.append(permuted)

    # One space-joined passage per ordering forms the training data.
    data = [' '.join(symbols) for symbols in orderings]

    # setup
    tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        vocab_size=300,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )

    # train the tokenizer
    tokenizer.train_from_iterator(data, trainer=trainer)
    tokenizer.save(args.outdir + '/ipa_tokenizer.json')
def train_tokenizer(sentences: List[str],
                    serialize_path: str = "",
                    vocab_size: int = 8000) -> Tokenizer:
    """Train a BERT-style WordPiece tokenizer on ``sentences``.

    The tokenizer is saved to ``serialize_path`` when that path is non-empty,
    and is returned in either case.
    """
    wordpiece_trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    # Ids 1 and 2 are the positions of [CLS] and [SEP] in the trainer's
    # special_tokens list above.
    template_specials = [("[CLS]", 1), ("[SEP]", 2)]
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=template_specials,
    )

    bert_tokenizer.train_from_iterator(sentences, trainer=wordpiece_trainer)

    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
class HuggingFaceWordLevelTokenizer(TokenizerBase):
    """Word-level tokenizer backed by the Hugging Face ``tokenizers`` library.

    Reads ``unknown_token``, ``lower``, ``max_vocab_size``,
    ``special_tokens``, ``pad_token``, ``sos_token`` and ``eos_token`` from
    the instance, and calls ``self._post_process``; none of these are defined
    in this class.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

        self.tokenizer = Tokenizer(
            models.WordLevel(unk_token=self.unknown_token))
        self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        if self.lower:
            self.tokenizer.normalizer = normalizers.Lowercase()

    def fit(self, *, texts=None, text_batch_iter=None, max_tokens=None):
        """Train the vocabulary and build the token/id lookup tables.

        ``text_batch_iter`` is used when given; otherwise ``texts`` is used
        as the training source. ``max_tokens`` is accepted but not used here;
        the vocabulary size comes from ``self.max_vocab_size``.

        Raises:
            ValueError: if neither ``texts`` nor ``text_batch_iter`` is given.
        """
        from tokenizers import trainers

        training_source = text_batch_iter
        if training_source is None:
            training_source = texts
        if training_source is None:
            raise ValueError(
                "fit() needs either `texts` or `text_batch_iter`")

        trainer = trainers.WordLevelTrainer(
            vocab_size=self.max_vocab_size,
            special_tokens=list(self.special_tokens))
        self.tokenizer.train_from_iterator(training_source, trainer=trainer)

        self.token_to_id = self.tokenizer.get_vocab()
        self.id_to_token = {
            token_id: token
            for token, token_id in self.token_to_id.items()
        }

    def encode(self, texts):
        """Encode ``texts`` to id sequences and hand them to ``_post_process``."""
        encodings = self.tokenizer.encode_batch(texts)
        id_seqs = [encoding.ids for encoding in encodings]

        return self._post_process(
            id_seqs,
            pad_id=self.token_to_id[self.pad_token]
            if self.pad_token else None,
            sos_id=self.token_to_id[self.sos_token]
            if self.sos_token else None,
            eos_id=self.token_to_id[self.eos_token]
            if self.eos_token else None,
        )

    def decode(self, id_seqs):
        """Decode a batch of id sequences and return the decoded batch."""
        # The decoded batch must be returned; it was previously discarded.
        return self.tokenizer.decode_batch(id_seqs)
def build_new_vocab():
    """Train a BPE tokenizer on BioASQ paragraph contexts.

    The tokenizer is saved to ``tokenizer/tokenizer-bioasq.json`` and the
    tokens of its vocabulary are returned as a list.
    """
    bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    bpe_tokenizer.pre_tokenizer = Whitespace()
    bpe_trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

    files = "/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-train-factoid-7b.json"
    with open(files) as handle:
        bioasq = json.load(handle)

    # Flatten every paragraph context of every question into one list.
    contexts = [
        paragraph['context']
        for question in bioasq['data']
        for paragraph in question['paragraphs']
    ]

    bpe_tokenizer.train_from_iterator(contexts, bpe_trainer)
    additional_vocab = list(bpe_tokenizer.get_vocab().keys())
    bpe_tokenizer.save("tokenizer/tokenizer-bioasq.json")
    return additional_vocab
class MLMPreprocessor:
    """Character-level preprocessor that builds masked-token examples.

    Holds a character dictionary built by ``fit``, a BPE tokenizer trained
    on the same data, and helpers that turn strings into fixed-length
    character-id arrays with one token replaced by a run of ``?``.
    """

    def __init__(
        self,
        load_from: str = None,
        vocab_size: int = 10000,
        max_example_len: int = 128,
        batch_size: int = 16,
        num_stopwords: int = 250,
        mask_output_len: int = 4,
    ):
        self.char_dict: Dict[str, int] = {}
        self.char_rev: Dict[int, str] = {}
        # NOTE(review): nothing in this class populates token_dict or
        # token_rev; string_to_example depends on token_dict, so it is
        # presumably filled by a caller or a loaded config -- confirm.
        self.token_dict: Dict[str, int] = {}
        self.token_rev: Dict[int, str] = {}
        self.vocab_size = vocab_size
        self.max_example_len = max_example_len
        self.batch_size = batch_size
        self.num_stopwords = num_stopwords
        self.mask_output_len = mask_output_len
        self.tokenizer_fit = False
        self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        self.tokenizer.pre_tokenizer = Whitespace()
        self.tokenizer.normalizer = Sequence(
            [NFD(), Lowercase(), StripAccents()])
        self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"],
                                      vocab_size=self.vocab_size)
        if load_from:
            self._load(load_from)

    def fit(self,
            data: List[str],
            min_char_freq: int = 1,
            progbar: bool = True):
        """
        Create a character-level dictionary based on a list of strings.

        Trains the BPE tokenizer on ``data`` first if it has not been
        trained yet. Characters seen at least ``min_char_freq`` times get
        ids starting at 4, in sorted order; ids 0-3 are reserved.
        """
        if not self.tokenizer_fit:
            self.tokenizer.train_from_iterator(data, trainer=self.tok_trainer)
            # Record that training happened so later calls do not retrain.
            self.tokenizer_fit = True

        char_counter: Counter = Counter()
        iterator_: Iterable = tqdm(data) if progbar else data
        for example in iterator_:
            char_counter.update(example)

        kept_chars = sorted(
            char for char, count in char_counter.items()
            if count >= min_char_freq)

        # Rebuild both mappings from scratch so a refit leaves no stale ids.
        self.char_rev = {0: "", 1: "?", 2: "?", 3: ""}
        self.char_dict = {}
        for char in kept_chars:
            index = len(self.char_rev)
            self.char_rev[index] = char
            self.char_dict[char] = index

    def scrub(self, token):
        """
        Normalize a token by removing punctuation.
        Used to build a vocabulary and to choose tokens to mask during pretraining.

        Lowercases, strips leading and trailing characters found in
        ``PUNCT``, and replaces every digit with ``#``.
        """
        token = token.lower()
        while len(token) > 0 and token[0] in PUNCT:
            token = token[1:]
        while len(token) > 0 and token[-1] in PUNCT:
            token = token[:-1]
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        token = re.sub(r"\d", "#", token)
        return token

    def tokenize(self, string_to_tokenize: str) -> "Encoding":
        """Encode ``string_to_tokenize`` with the BPE tokenizer."""
        return self.tokenizer.encode(string_to_tokenize)

    def string_to_array(self, string_in, length, padding_pre=False):
        """Map a string to a float array of character ids of size ``length``.

        The string is truncated to ``length`` characters (keeping the end
        when ``padding_pre`` is true, the start otherwise), unknown
        characters map to 1, and the result is zero-padded on the left
        (``padding_pre``) or on the right.
        """
        # truncate
        kept = string_in[-length:] if padding_pre else string_in[:length]

        # map char -> int; characters missing from char_dict become 1
        mapped = np.array(
            [self.char_dict.get(char, 1) for char in kept], dtype=float)

        missing = length - len(kept)
        pad_width = (missing, 0) if padding_pre else (0, missing)
        return np.pad(mapped, pad_width, "constant", constant_values=(0, 0))

    def string_to_example(self,
                          example,
                          return_example=False,
                          allow_null_examples=False):
        """Build one masked example from a whitespace-delimited string.

        A random token whose scrubbed form is in ``token_dict`` is replaced
        by ``mask_output_len`` question marks. Returns
        ``[encoded, label_array]`` (plus the masked string when
        ``return_example`` is true), or ``None`` when no token is
        replaceable and ``allow_null_examples`` is true.

        Raises:
            AssertionError: if the example has fewer than two tokens, or if
                no token is replaceable and ``allow_null_examples`` is false.
        """
        # simple tokenization
        sp = [tok.strip() for tok in example.split(" ") if tok.strip() != ""]
        # normalize to see what we can replace
        normed = [self.scrub(tok) for tok in sp]
        # see which tokens are in the vocabulary
        replaceable_tokens = [(tok, position)
                              for position, tok in enumerate(normed)
                              if tok in self.token_dict]
        assert (
            len(sp) >= 2
        ), "minimum length of an example is 2 tokens (white-space delimited)"
        assert (
            len(replaceable_tokens) > 0 or allow_null_examples
        ), f"called string_to_example on string with no tokens that are in the vocabulary and allow_null_examples=False\n{example}"
        if len(replaceable_tokens) == 0 and allow_null_examples:
            return None

        # choose a token to replace
        choice = np.random.randint(0, len(replaceable_tokens))
        rep, rep_ind = replaceable_tokens[choice]

        # get the index of the token for the output
        mask_ind = self.token_dict[rep]
        label_array = np.zeros(self.vocab_size)
        label_array[mask_ind] = 1

        # piece the masked input back together
        left = " ".join(sp[:rep_ind])
        right = " ".join(sp[rep_ind + 1:])
        left_len = len(left)
        right_len = len(right)

        # Half of the room left after the mask and its two spaces; a short
        # side donates its unused room to the other side.
        thresh = (self.max_example_len - self.mask_output_len - 2) // 2
        right_diff = thresh - left_len if left_len < thresh else 0
        left_diff = thresh - right_len if right_len < thresh else 0
        left_sub = left[-(thresh + left_diff):]
        right_sub = right[:(thresh + right_diff)]

        # The mask width follows mask_output_len, matching the threshold above.
        combo = left_sub + " " + "?" * self.mask_output_len + " " + right_sub
        encoded = self.string_to_array(combo, self.max_example_len)
        ret_val = [encoded, label_array]
        if return_example:
            ret_val += [combo]
        return ret_val

    def strings_to_examples(self, strings):
        """Return ``[enc, labels]`` arrays with one row per input string."""
        enc = np.zeros((len(strings), self.max_example_len))
        labels = np.zeros((len(strings), self.vocab_size))
        for row, text in enumerate(strings):
            enc[row], labels[row] = self.string_to_example(text)
        return [enc, labels]

    def examples_generator(self, strings):
        """Yield batches of ``batch_size`` examples, cycling over ``strings`` forever."""
        assert len(strings) >= 1
        while True:
            for start in range(0, len(strings), self.batch_size):
                yield self.strings_to_examples(
                    strings[start:start + self.batch_size])

    def save(self, path):
        """
        Write a Preprocessor object to a .JSON config.

        The file is ``cclm_config.json`` inside the directory ``path``.
        """
        config = {
            "char_rev": self.char_rev,
            "char_dict": self.char_dict,
            "max_example_len": self.max_example_len,
        }
        with open(os.path.join(path, "cclm_config.json"), "w") as f:
            json.dump(config, f)

    def _load(self, path):
        """
        Load a Preprocessor object from disk.

        ``path`` may be the config file itself or the directory that
        ``save`` wrote into.
        """
        if os.path.isdir(path):
            path = os.path.join(path, "cclm_config.json")
        with open(path, "rb") as f:
            result = json.load(f)
        for key, value in result.items():
            if key == "char_rev":
                # JSON object keys are always strings; restore integer ids.
                value = {int(index): char for index, char in value.items()}
            setattr(self, key, value)
TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data: the de-duplicated item names are the training corpus.
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train['item_name'].drop_duplicates().tolist()

# WordPiece tokenizer (70k vocabulary): lowercase, then split on whitespace
# and digits.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
trainer = WordPieceTrainer(
    vocab_size=70000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train_from_iterator(item_names, trainer)
wordpiece_path = os.path.join(OUTPUT_PATH, 'wordpiece_70k.json')
tokenizer.save(wordpiece_path)

# BPE tokenizer (60k vocabulary), same normalizer and pre-tokenizer.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
trainer = BpeTrainer(
    vocab_size=60000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train_from_iterator(item_names, trainer)
bpe_path = os.path.join(OUTPUT_PATH, 'bpe_60k.json')
tokenizer.save(bpe_path)

# Unigram tokenizer
tokenizer = Tokenizer(Unigram())
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """Load or train the tokenizer for ``dataset`` in the given ``style``.

    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists, otherwise train and store it

    Returns:
        (tokenizer, vocab) -- ``vocab`` is currently always None.

    Raises:
        AssertionError: if ``dataset`` is not in VALID_DATASETS or ``style``
            is not in VALID_TOKENIZATIONS.
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
    else:
        # Only claim the file is missing when that is the actual reason.
        if force_retrain:
            print('force_retrain=True; training new tokenizer at %s' %
                  tpath_expected)
        else:
            print('%s tokenizer file does not exist; training new tokenizer' %
                  tpath_expected)

        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        # - init Tokenizer using algo
        # - specify algo specific trainer
        # - specify any pre-processing of text (will affect decoding)
        #   see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        # - different training calls if its the arxiv dataset or wikitext
        #   see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
        special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(special_tokens=special_tokens)
            tokenizer.pre_tokenizer = ByteLevel()
            decoder = decoders.ByteLevel()
        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(special_tokens=special_tokens)
            tokenizer.pre_tokenizer = Whitespace()
            # WordPiece seems to work (adds back spaces)
            decoder = decoders.WordPiece()

        # 'arxiv' is trained from an iterator; everything else is passed
        # to Tokenizer.train.
        if dataset == 'arxiv':
            tokenizer.train_from_iterator(datafiles, trainer=trainer)
        else:
            tokenizer.train(datafiles, trainer=trainer)
        tokenizer.decoder = decoder

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is
    #     present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #           dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
import datasets
from tokenizers import normalizers, pre_tokenizers, Tokenizer, models, trainers

# Build a tokenizer: BPE model, lowercase normalizer, whitespace pre-tokenizer
bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.normalizer = normalizers.Lowercase()
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Initialize a dataset
dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1")


# Build an iterator over this dataset
def batch_iterator():
    """Yield the "text" column of the train split in batches of 1000 rows."""
    train_split = dataset["train"]
    batch_length = 1000
    total_rows = len(train_split)
    start = 0
    while start < total_rows:
        yield train_split[start:start + batch_length]["text"]
        start += batch_length


# And finally train; `length` is the number of rows in the train split
bpe_tokenizer.train_from_iterator(batch_iterator(),
                                  length=len(dataset["train"]))
def train_custom_tokenizer(dataset,
                           token_model,
                           tknzr_file,
                           vocab_size,
                           vocab=None,
                           pretrain_fast=False,
                           max_input_chars_per_word=None,
                           eos_token=None,
                           bos_token=None,
                           pad_token=None,
                           mask_token=None,
                           unk_token=None):
    """
    Building a Tokenizer using HuggingFace library. The pipeline seems to be:

        - Model           : algorithm that tokenizes, it is a mandatory
                            component. There are only 4 models implemented
                            (BPE, Unigram, WordLevel, WordPiece)
        - Normalizer      : some preprocessing that could happen before, but
                            doesn't necessarily have to
        - Pre-Tokenizer   : splitting the input according to some rules
        - Post-Processing : needing to add some tokens/input after (mostly
                            seems to be eos, bos tokens)
        - Decoder         : certain previous pipeline steps need to be
                            reversed for proper decoding
        - Trainer         : The corresponding training algorithm for the model

    Note : Some pre-processing might need to happen beforehand in previous
           functions (might be easier using pandas before)

    Input
        dataset                  : indexable object with a length; items
                                   ``dataset[0] .. dataset[len(dataset)-1]``
                                   are used for training
        token_model (str)        : algorithm to use for tokenization
        tknzr_file (str)         : file the trained tokenizer is saved to.
                                   Will overwrite a previously saved file.
        vocab_size (int or None) : size of the vocabulary to use; when None
                                   the trainer's own default applies
        vocab                    : models other than BPE can use
                                   non-mandatory vocab as input
        pretrain_fast (bool)     : wrap the result in PreTrainedTokenizerFast
                                   instead of PreTrainedTokenizer
        max_input_chars_per_word : used for WordPiece
        eos_token, bos_token, pad_token, mask_token, unk_token (str or None)
                                 : special tokens; the ones left as None are
                                   not registered with the trainer

    Output
        tokenizer : the transformers wrapper around the fully trained
                    tokenizer, with pad_token and mask_token set

    Raises
        SystemExit : if token_model is not one of the four supported models
    """
    # Special tokens are listed in the trainer in this order; only the ones
    # that were actually provided are registered.
    special_token_lst = [
        token
        for token in (pad_token, bos_token, eos_token, mask_token, unk_token)
        if token is not None
    ]

    # NFKC
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        # Forward only the options that were given, so the model's own
        # defaults apply to the rest instead of receiving None.
        wordpiece_kwargs = {}
        if unk_token is not None:
            wordpiece_kwargs['unk_token'] = unk_token
        if max_input_chars_per_word is not None:
            wordpiece_kwargs['max_input_chars_per_word'] = \
                max_input_chars_per_word
        model = WordPiece(vocab=vocab, **wordpiece_kwargs)
        Trainer = WordPieceTrainer
    else:
        error_msg = (f'Error: token_model ({token_model}) not an algorithm '
                     f'in {VALID_TOKENIZATIONS}')
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    trainer_kwargs = {
        'show_progress': True,
        'special_tokens': special_token_lst,
    }
    if vocab_size is not None:
        trainer_kwargs['vocab_size'] = vocab_size
    trainer = Trainer(**trainer_kwargs)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence(
        [fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing; this needs both a bos and an eos token.
    # Their ids are their positions in the special-token list.
    if bos_token is not None and eos_token is not None:
        bos_idx = special_token_lst.index(bos_token)
        eos_idx = special_token_lst.index(eos_token)
        tokenizer.post_processor = processors.TemplateProcessing(
            single=bos_token + " $A " + eos_token,
            special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in range(len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print("Warning : overwriting previously save tokenizer with "
              f"same filename ( {tknzr_file} ).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        # NOTE(review): confirm that PreTrainedTokenizer can load a
        # `tokenizers` JSON file this way.
        tokenizer = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer
bert_tokenizer.train_from_iterator(sentences, trainer=trainer) if serialize_path: bert_tokenizer.save(serialize_path) return bert_tokenizer ids = bert_tokenizer.encode(sentences[10]).ids bert_tokenizer.decode(ids) from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers tokenizer = Tokenizer(models.Unigram()) tokenizer.normalizer = normalizers.NFKC() tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel() tokenizer.decoders = decoders.ByteLevel() trainer = trainers.UnigramTrainer( vocab_size=20000, initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), special_tokens=["<PAD>", "<BOS>", "<EOS>"], ) tokenizer.train_from_iterator(sentences, trainer=trainer) tokenizer.encode(sentences[4]).ids tokenizer.decode(tokenizer.encode(sentences[4]).ids) tokenizer.save('bert_out/test2') tokenizer.save_pretrained('bert_out/test')
parser.add_argument('--languages',
                    type=str,
                    required=True,
                    help='dataset languages to tokenize')
parser.add_argument('--tokenizer-out',
                    type=str,
                    required=True,
                    help='tokenizer output file')
parser.add_argument('--special-tokens',
                    type=str,
                    default="[UNK],[SEP],[PAD],[MASK],[ECHO],[TRANSLATE]")
args = parser.parse_args()

# translation_dataset = load_dataset(args.dataset, args.languages)
# translation_dataset.set_format(columns='translation')
translation_dataset = NewsCommentaryTranslationDataset()

tokenizer_file = args.tokenizer_out
# The special tokens arrive as one comma-separated string.
special_tokens = args.special_tokens.split(",")

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=special_tokens)

# Each example yields one batch: its sentence in every available language.
all_translation_sentences = (
    list(example['translation'].values()) for example in translation_dataset)

tokenizer.train_from_iterator(all_translation_sentences, trainer=trainer)
tokenizer.save(tokenizer_file)