def __init__(
    self,
    load_from: str = None,
    vocab_size: int = 10000,
    max_example_len: int = 128,
    batch_size: int = 16,
    num_stopwords: int = 250,
    mask_output_len: int = 4,
):
    self.char_dict: Dict[str, int] = {}
    self.char_rev: Dict[int, str] = {}
    self.token_dict: Dict[str, int] = {}
    self.token_rev: Dict[int, str] = {}
    self.vocab_size = vocab_size
    self.max_example_len = max_example_len
    self.batch_size = batch_size
    self.num_stopwords = num_stopwords
    self.mask_output_len = mask_output_len
    self.tokenizer_fit = False
    self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    self.tokenizer.pre_tokenizer = Whitespace()
    self.tokenizer.normalizer = Sequence(
        [NFD(), Lowercase(), StripAccents()])
    self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"],
                                  vocab_size=self.vocab_size)
    if load_from:
        self._load(load_from)
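# Hedged usage sketch (not part of the original class): one plausible way the
# fields initialised above could be used to fit the internal BPE tokenizer.
# The method name `fit_tokenizer` and the `texts` argument are assumptions for
# illustration only.
def fit_tokenizer(self, texts):
    # Train the BPE model on an iterable of raw strings using the trainer
    # configured in __init__, then mark the tokenizer as fitted.
    self.tokenizer.train_from_iterator(texts, self.tok_trainer)
    self.tokenizer_fit = True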
def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided.

    :param tokenizer_location: JSON file containing information about the tokenizer.
    :return:
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location)
        tokenizer.enable_padding()
        return tokenizer
    else:
        dataset_train = datasets.load_dataset("daily_dialog", split="train")
        utterances = [
            special_tokens["sep_token"].join(dialogue["dialog"])
            for dialogue in dataset_train
        ]

        trainer = WordPieceTrainer(
            vocab_size=2048,
            special_tokens=list(token_utils.special_tokens.values()),
        )
        custom_tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"]))
        custom_tokenizer.normalizer = normalizers.Sequence(
            [NFD(), Lowercase(), StripAccents()])
        custom_tokenizer.pre_tokenizer = Whitespace()
        custom_tokenizer.train_from_iterator(utterances, trainer)
        custom_tokenizer.enable_padding()

        # Save the trained tokenizer to disk.
        location = './daily_dialog/'
        custom_tokenizer.save(location + "tokenizer.json")

        return custom_tokenizer
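# Hedged usage sketch: loading or training the tokenizer above and encoding a
# small batch of utterances. The sample strings are assumptions for
# illustration only; training from scratch requires the `datasets` download.
if __name__ == "__main__":
    tok = get_daily_dialog_tokenizer()  # or get_daily_dialog_tokenizer("./daily_dialog/tokenizer.json")
    batch = tok.encode_batch(["hi , how are you ?", "fine , thanks !"])
    for enc in batch:
        # enable_padding() pads each encoding to the longest sequence in the batch
        print(enc.tokens, enc.ids)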
def train_tokenizer(langs, dataset, vocab_size):
    """Train a tokenizer on a given list of languages.

    Reserves a special token for each language, of the form [LANG] where LANG
    is the language tag. These are assigned to token ids 5, 6, ..., len(langs) + 4.
    """
    # Byte-pair encoding
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # trainer
    lang_tokens = ['[' + lang + ']' for lang in langs]
    special_tokens = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'] + lang_tokens
    trainer = BpeTrainer(
        special_tokens=special_tokens,
        vocab_size=vocab_size)

    # normalise and pre-tokenize
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # create iterator and train
    iterator = _MultilingualIterator(dataset, langs)
    tokenizer.train_from_iterator(iterator, trainer)

    # post process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    return tokenizer
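# Hedged usage sketch: how the reserved language tags might be used at encode
# time. The language list, `dataset` object and sample sentence are
# assumptions for illustration only.
tokenizer = train_tokenizer(["en", "de"], dataset, vocab_size=16_000)
enc = tokenizer.encode("[en] The quick brown fox")
print(enc.tokens)                     # wrapped as [CLS] ... [SEP] by the post-processor
print(tokenizer.token_to_id("[en]"))  # language tags occupy ids 5, 6, ... as documented above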
def train():
    """Source: https://huggingface.co/docs/tokenizers/pipeline"""
    base = os.environ['DATA_ROOT']
    corpus_path = base + 'MimicIII/Encounters/Text/'

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # input to tokenizer.encode() goes through this pipeline:
    # normalization, pre-tokenization, model, post-processing
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)])

    files = [str(file) for file in Path(corpus_path).glob('*.txt')]
    trainer = WordPieceTrainer(
        vocab_size=30522,
        show_progress=True,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train(files, trainer)

    # create the output directory if it does not exist yet
    os.makedirs('./Tokenizer', exist_ok=True)
    bert_tokenizer.save("Tokenizer/tokenizer.json")
def tokenize(dt, df):
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer

    data_source = get_data_source(dt)
    token_file = Path(data_dir, data_source, 'tokenizer.json')
    vocab_file = Path(data_dir, data_source, 'vocab.txt')
    corpus_file = Path(data_dir, data_source, 'corpus.txt')
    if vocab_file.is_file() and corpus_file.is_file():
        print("corpus and token files already generated")
        return 0

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=25000,
        min_frequency=3,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train_from_iterator(df.query_text.to_list(), trainer)
    bert_tokenizer.save(str(token_file))

    # Split each session's queries into two halves and join the query text
    # per half before writing the corpus file.
    df['range_idx'] = range(0, df.shape[0])
    df['mean_rank_group'] = df.groupby(
        ['session_id'], sort=False)['range_idx'].transform(np.mean)
    df['separate_column'] = df['range_idx'] < df['mean_rank_group']
    df = df.groupby(['session_id', 'separate_column'], as_index=False,
                    sort=False)['query_text'].agg(' '.join).drop(columns='separate_column')
    # alternative: df = df.groupby('session_id').agg({'query_text': ' '.join}).reset_index()
    df.query_text.to_csv(corpus_file, header=False, index=False)

    # Dump the trained vocabulary from the serialized tokenizer to a plain-text vocab file.
    with open(token_file) as token_f:
        jdata = json.load(token_f)
    with open(vocab_file, "w") as fd:
        for k in jdata['model']['vocab'].keys():
            print(k, file=fd)
def tokenizer_pipeline():
    """
    Specific pipeline for Cebuano Corpus tokenization
    - uses a byte-pair encoding (BPE) tokenizer
    """
    tokenizer = Tokenizer(BPE())
    # string normalization
    tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
    tokenizer.pre_tokenizer = ByteLevel()
    tokenizer.decoder = ByteLevelDecoder()
    return tokenizer
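# Hedged usage sketch: the pipeline above has no trainer attached, so one
# plausible way to fit it is with a BpeTrainer over raw text files. The file
# list, vocab size and special tokens are assumptions for illustration only.
from tokenizers.trainers import BpeTrainer

def train_cebuano_tokenizer(files, vocab_size=30_000):
    tokenizer = tokenizer_pipeline()
    trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=["<unk>", "<pad>"])
    tokenizer.train(files, trainer)
    return tokenizer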
def _prepare_pipeline(self):
    self.tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    self.tokenizer.pre_tokenizer = Whitespace()
    self.tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    self.tokenizer.enable_padding(
        pad_id=self.__class__.SPECIAL_TOKENS.index("[PAD]"),
        pad_token="[PAD]")
def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece())
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]", "[MASK]"]  # "[SEP]", "[PAD]", "[MASK]"]
    )
    files = glob.glob(args.text_raw_files_pattern)
    # Tokenizer.train takes the file list first, then the trainer
    bert_tokenizer.train(files, trainer)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir, "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")

    bert_tokenizer.save(os.path.join(args.output_dir, "bert-tokenizer-kr.json"))
def configure_tokenizers(self, padding, truncation, max_length, lower):
    # Settings
    pad_length = None
    if padding in {True, "longest"}:
        pass
    elif padding in {"max_length"}:
        pad_length = max_length
    elif padding in {False, "do_not_pad"}:
        pass
    else:
        raise ValueError("Unknown padding type")

    # SRC tokenizer
    tok_normalizers = [NFD(), Strip()]
    if lower:
        tok_normalizers += [Lowercase()]

    self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
    self.tokenizer.add_special_tokens(self.special_tokens)
    self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()])
    self.tokenizer.normalizer = normalizers.Sequence(tok_normalizers)  # StripAccents requires NFD
    self.tokenizer.decoder = tok_decoder()

    # Define template (needed for the sos/eos tokens)
    basic_template = TemplateProcessing(
        single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
        pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
        special_tokens=[
            (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
            (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD)),
        ],
    )
    self.tokenizer.post_processor = basic_template

    if padding:
        self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id(self.PAD_WORD),
                                      pad_token=self.PAD_WORD,
                                      length=pad_length)
    if truncation:
        self.tokenizer.enable_truncation(max_length, stride=0, strategy='longest_first')
def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer:
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
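# Hedged usage sketch: training on a toy in-memory corpus and encoding a new
# sentence. The sample sentences, output path and vocab size are assumptions
# for illustration only.
if __name__ == "__main__":
    corpus = ["the cat sat on the mat", "the dog chased the cat"]
    tok = train_tokenizer(corpus, serialize_path="bert_like_tokenizer.json", vocab_size=200)
    enc = tok.encode("the cat chased the dog")
    # the post-processor wraps the sequence, e.g. ['[CLS]', 'the', 'cat', ..., '[SEP]']
    print(enc.tokens)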
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)

    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer
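# Hedged usage sketch: the decoders.WordPiece() set above merges '##'
# continuation pieces back into words on decode. Assumes the wikitext-103 raw
# files referenced above are available; the sample text is an assumption.
tok = train_wordpiece_bert()
enc = tok.encode("Welcome to the Tokenizers library.")
print(enc.tokens)           # lowercased, accent-stripped WordPiece tokens
print(tok.decode(enc.ids))  # '##' pieces are rejoined by the WordPiece decoder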
from itertools import product

from tokenizers import Tokenizer, Regex
from tokenizers.models import WordLevel
from tokenizers.normalizers import NFD
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Split, WhitespaceSplit


## Loop which creates and trains a tokenizer per (normalization, event) pair
def stackTraceTokenizer(tokens: tuple, events: tuple, vocab_size=2_000, min_freq=3):
    for norm, event in product(tokens, events):
        print(norm, event)
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.normalizer = NFD()

        if norm == 'white':
            tokenizer.pre_tokenizer = WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = Split(
                pattern=Regex("[A-Z]+[a-z0-9]+|[.A-Z]+|[a-z0-9]+"),
                behavior='isolated')

        trainer = WordLevelTrainer(
            vocab_size=vocab_size,
            min_frequency=min_freq,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
        tokenizer.train(["vocab-{}.txt".format(event)], trainer)

        print(f"Trained tokenizer for {norm}:{event}")
        yield tokenizer, event, norm
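# Hedged usage sketch: consuming the generator above and saving each trained
# tokenizer. The token/event values and output naming scheme are assumptions
# for illustration only, and the corresponding vocab-<event>.txt corpora are
# assumed to exist on disk.
if __name__ == "__main__":
    for tokenizer, event, norm in stackTraceTokenizer(tokens=('white', 'split'),
                                                      events=('crash', 'hang')):
        tokenizer.save(f"tokenizer-{norm}-{event}.json")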
import fire

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFD, Lowercase, Strip


def train(dataset_path, output_dir='data/tokenizer/', vocab_size=30_000, min_frequency=3):
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), Strip()])

    files = [dataset_path]
    # Tokenizer.train takes the file list first, then the trainer
    tokenizer.train(files, trainer)

    files = tokenizer.model.save(output_dir)
    tokenizer.model = WordPiece.from_file(*files, unk_token='[UNK]')
    tokenizer.save(f'{output_dir}tokenizer.json')


if __name__ == '__main__':
    fire.Fire(train)
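# Hedged usage sketch: reloading the serialized tokenizer produced by train().
# The helper name `load_trained_tokenizer` is an assumption for illustration
# only; the path mirrors the default output_dir above.
def load_trained_tokenizer(output_dir='data/tokenizer/'):
    # Restores the full pipeline (normalizer, pre-tokenizer, model) from JSON.
    return Tokenizer.from_file(f'{output_dir}tokenizer.json')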