def get_recurrent_tokenizer(vocab, max_context_tokens, unk_token, pad_token, device="cpu"):
    """
    Return a tokenizer to be used with recurrent-based models
    """
    question_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    question_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    question_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    question_tokenizer.enable_padding(direction="right",
                                      pad_id=vocab[pad_token],
                                      pad_type_id=1,
                                      pad_token=pad_token)

    context_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    context_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    context_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    context_tokenizer.enable_padding(
        direction="right",
        pad_id=vocab[pad_token],
        pad_type_id=1,
        pad_token=pad_token,
    )
    context_tokenizer.enable_truncation(max_context_tokens)

    return RecurrentSquadTokenizer(question_tokenizer, context_tokenizer, device=device)
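# Hedged usage sketch for get_recurrent_tokenizer above. The toy vocab and
# special-token names are illustrative; RecurrentSquadTokenizer, WordLevel,
# Sequence, PreSequence, etc. come from the surrounding module's imports.
toy_vocab = {"[PAD]": 0, "[UNK]": 1, "what": 2, "is": 3, "this": 4}
squad_tokenizer = get_recurrent_tokenizer(
    toy_vocab, max_context_tokens=300, unk_token="[UNK]", pad_token="[PAD]")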
def __init__(
    self,
    load_from: str = None,
    vocab_size: int = 10000,
    max_example_len: int = 128,
    batch_size: int = 16,
    num_stopwords: int = 250,
    mask_output_len: int = 4,
):
    self.char_dict: Dict[str, int] = {}
    self.char_rev: Dict[int, str] = {}
    self.token_dict: Dict[str, int] = {}
    self.token_rev: Dict[str, int] = {}
    self.vocab_size = vocab_size
    self.max_example_len = max_example_len
    self.batch_size = batch_size
    self.num_stopwords = num_stopwords
    self.mask_output_len = mask_output_len
    self.tokenizer_fit = False
    self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    self.tokenizer.pre_tokenizer = Whitespace()
    self.tokenizer.normalizer = Sequence(
        [NFD(), Lowercase(), StripAccents()])
    self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"],
                                  vocab_size=self.vocab_size)
    if load_from:
        self._load(load_from)
def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided.

    :param tokenizer_location: JSON file containing information about the tokenizer.
    :return: the tokenizer
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location)
        tokenizer.enable_padding()
        return tokenizer
    else:
        dataset_train = datasets.load_dataset("daily_dialog", split="train")
        utterances = [
            special_tokens["sep_token"].join(dialogue["dialog"])
            for dialogue in dataset_train
        ]

        trainer = WordPieceTrainer(
            vocab_size=2048,
            special_tokens=list(token_utils.special_tokens.values()))

        custom_tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"]))
        custom_tokenizer.normalizer = normalizers.Sequence(
            [NFD(), Lowercase(), StripAccents()])
        custom_tokenizer.pre_tokenizer = Whitespace()
        custom_tokenizer.train_from_iterator(utterances, trainer)
        custom_tokenizer.enable_padding()

        # Save the trained tokenizer to file.
        location = './daily_dialog/'
        custom_tokenizer.save(location + "tokenizer.json")

        return custom_tokenizer
def __init__(self, path):
    self.path = path
    text_paths = [
        str(x) for x in Path("./dataset/corpus/").glob("**/*.txt")
    ]
    savedpath = "./dataset/tok_model/MALBERT-vocab.txt"
    if os.path.exists(savedpath):
        self.tokenizer = tokenizers.BertWordPieceTokenizer(savedpath)
    else:
        self.tokenizer = tokenizers.BertWordPieceTokenizer()
        self.tokenizer.train(
            files=text_paths,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
            vocab_size=14200)
        self.tokenizer.save_model("./dataset/tok_model", "MALBERT")
    self.tokenizer.enable_truncation(max_length=512)
    self.pretokenizer = tokenizers.pre_tokenizers.Sequence(
        [Whitespace(), Digits(individual_digits=True)])
    self.vocab = self.tokenizer.get_vocab()
    self.mask_index = self.vocab.get("[MASK]")
    self.pad_index = self.vocab.get("[PAD]")
    self.eos_index = self.vocab.get("[SEP]")
    self.sos_index = self.vocab.get("[CLS]")
    self.unk_index = self.vocab.get("[UNK]")
def train_tokenizer(lang, dataset, vocab_size):
    # Byte-pair encoding model
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # Trainer
    trainer = BpeTrainer(
        special_tokens=['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'],
        vocab_size=vocab_size)

    # Pre-tokenize on whitespace
    tokenizer.pre_tokenizer = Whitespace()

    # Train
    tokenizer.train_from_iterator(dataset[lang], trainer)

    # Post-process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    return tokenizer
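# Hedged usage sketch for train_tokenizer above, assuming `dataset` maps a
# language key to an iterable of raw strings (the toy corpus is illustrative).
tok = train_tokenizer("en", {"en": ["hello world", "hello there"]}, vocab_size=100)
enc = tok.encode("hello world")
print(enc.tokens)  # the template post-processor wraps the tokens in [CLS] ... [SEP]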
def train():
    """Source: https://huggingface.co/docs/tokenizers/pipeline"""
    base = os.environ['DATA_ROOT']
    corpus_path = base + 'MimicIII/Encounters/Text/'

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # Input to tokenizer.encode() goes through this pipeline:
    # normalization, pre-tokenization, model, post-processing.
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)])

    files = [str(file) for file in Path(corpus_path).glob('*.txt')]
    trainer = WordPieceTrainer(
        vocab_size=30522,
        show_progress=True,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train(files, trainer)

    os.makedirs('./Tokenizer', exist_ok=True)  # don't fail if the directory already exists
    bert_tokenizer.save("Tokenizer/tokenizer.json")
def main() -> None:
    args = parse_args()

    special_tokens = list(SPECIAL_TOKENS)
    if args.reserved < len(special_tokens):
        raise AssertionError(
            f"number of reserved tokens should be at least the number of special tokens ({len(special_tokens)})")
    for i in range(len(special_tokens), args.reserved):
        special_tokens.append(f"[unused{i:03d}]")

    all_filenames = get_all_filenames(args.input)
    # "C:\Users\demianmedich\data\wiki\20191120.en\pp_cased/"

    tokenizer = Tokenizer(get_model(args.model))
    tokenizer.normalizer = normalizers.Sequence([
        NFKC(), StripAccents(), Lowercase()
    ])
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=args.vocab_size, special_tokens=special_tokens)
    tokenizer.train(trainer, all_filenames)

    model_files = tokenizer.model.save()
    sys.exit(0)
def load_janome_tokenizer(tokenizer_path) -> Tokenizer:
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.pre_tokenizer = Sequence([
        Whitespace(),
        PreTokenizer.custom(JanomePreTokenizer()),
    ])
    tokenizer.decoder = Decoder.custom(JanomeDecoder())
    return tokenizer
def tokenize_cards(
        files=['./dataset/cards_train.txt', './dataset/cards_val.txt'],
        output_dir='./tokenizer'):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train(files=files, special_tokens=SPECIAL_TOKENS + OTHER_TOKENS)
    tokenizer.save_model(output_dir)
def tokenize(dt, df):
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer

    data_source = get_data_source(dt)
    token_file = Path(data_dir, data_source, 'tokenizer.json')
    vocab_file = Path(data_dir, data_source, 'vocab.txt')
    corpus_file = Path(data_dir, data_source, 'corpus.txt')
    if vocab_file.is_file() and corpus_file.is_file():
        print("corpus and token files already generated")
        return 0

    # Train a WordPiece tokenizer on the query text.
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=25000,
        min_frequency=3,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train_from_iterator(df.query_text.to_list(), trainer)
    bert_tokenizer.save(str(token_file))

    # Split each session's queries into two halves and join each half into one document.
    df['range_idx'] = range(0, df.shape[0])
    df['mean_rank_group'] = df.groupby(
        ['session_id'], sort=False)['range_idx'].transform(np.mean)
    df['separate_column'] = df['range_idx'] < df['mean_rank_group']
    df = df.groupby(['session_id', 'separate_column'],
                    as_index=False,
                    sort=False)['query_text'].agg(' '.join).drop(columns='separate_column')
    df.query_text.to_csv(corpus_file, header=False, index=False)

    # Dump the trained vocabulary to a plain-text vocab file.
    with open(token_file) as token_f:
        jdata = json.load(token_f)
    with open(vocab_file, "w") as fd:
        for k in jdata['model']['vocab'].keys():
            print(k, file=fd)
def inference(checkpoint_path,
              hyperparameters_path,
              tokenizer_path,
              input='In 1691 Moscow established ',
              generated_length=64,
              random_selection=True):
    # Initialize tokenizer from file (from_file replaces the freshly built BPE tokenizer).
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer = tokenizer.from_file(tokenizer_path)

    # Initialize model from checkpoint and hyperparameters
    model = LMModel.load_from_checkpoint(checkpoint_path=checkpoint_path,
                                         hparams_file=hyperparameters_path)

    # Tokenize input sample
    encoded_sample = tokenizer.encode(input).ids

    for i in range(generated_length):
        input_ids = torch.unsqueeze(
            torch.tensor(encoded_sample, dtype=torch.long), axis=0)

        # Inference
        output, attn = model(input_ids)
        last_word = output[0][-1]

        if not random_selection:
            # Pick the highest-probability token from the output distribution
            prediction = torch.argmax(output, axis=2).squeeze(axis=0).tolist()[-1]
        else:
            # Sample a token according to its (sharpened) probability
            prediction = torch.multinomial(torch.softmax(last_word, 0)**10, 1)[0]

        # Add prediction to the sequence
        encoded_sample.append(prediction)

    # Detokenize output sample
    decoded_output = tokenizer.decode(encoded_sample)
    output_tokens = [tokenizer.id_to_token(int(id)) for id in encoded_sample]

    return decoded_output, output_tokens, attn
def _prepare_pipeline(self):
    self.tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    self.tokenizer.pre_tokenizer = Whitespace()
    self.tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    self.tokenizer.enable_padding(
        pad_id=self.__class__.SPECIAL_TOKENS.index("[PAD]"),
        pad_token="[PAD]")
def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece())
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]", "[MASK]"]
        # "[SEP]", "[PAD]", "[MASK]"]
    )
    files = glob.glob(args.text_raw_files_pattern)
    bert_tokenizer.train(trainer, files)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir, "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")

    bert_tokenizer.save(os.path.join(args.output_dir, "bert-tokenizer-kr.json"))
def load_tokenizer(vocab='./tokenizer/vocab.json',
                   merges='./tokenizer/merges.txt',
                   gpt=False,
                   load_from=None):
    if gpt:
        if load_from:
            tokenizer = GPT2Tokenizer.from_pretrained(load_from)
        else:
            tokenizer = GPT2Tokenizer(
                vocab,
                merges,
                bos_token=CARD_BEGIN,
                eos_token=CARD_END,
                sep_token=CARD_END,
                unk_token=UNK,
                pad_token=CARD_PAD,
                mask_token=CARD_MASK,
                padding_side="left")
    else:
        tokenizer = ByteLevelBPETokenizer(vocab, merges)
        tokenizer.add_special_tokens(SPECIAL_TOKENS + OTHER_TOKENS)
        tokenizer.mask_token = CARD_MASK
        tokenizer.pre_tokenizer = Whitespace()
    return tokenizer
def create_tokenizer(sentence_list):
    filename = f'temp_{time.strftime("%Y%m%d-%H%M%S")}.txt'
    with open(filename, 'w') as f:
        for s in sentence_list:
            f.write(f'{s}\n')

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.enable_padding(pad_token='[PAD]', pad_id=0)
    trainer = WordPieceTrainer(
        vocab_size=3000,
        special_tokens=['[PAD]', '[S]', '[/S]', '[UNK]'])
    tokenizer.train(trainer, [filename])

    os.remove(filename)
    return tokenizer
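# Hedged usage sketch for create_tokenizer above. The sentences are illustrative,
# and the older train(trainer, files) argument order used in this snippet is assumed.
tok = create_tokenizer(["a first sentence", "a second longer sentence"])
encs = tok.encode_batch(["a first sentence", "a second"])
print([e.tokens for e in encs])  # the shorter encoding is right-padded with '[PAD]'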
class GloveTokenizer():
    """Glove Tokenizer that maps from text -> vectors.

    GloVe embeddings are not contextual (they are just a lookup), so in the
    huggingface paradigm they are actually part of the tokenizer. Doing this
    properly would involve creating a pre-trained tokenizer that is formed
    using something to the effect of:

    ```python
    import tokenizers
    from tokenizers.models import WordPiece
    tokenizer = tokenizers.Tokenizer(WordPiece())
    # add normalizer/pre_tokenizer
    # ...
    ```

    ... but this feels like overkill here. I'll make my own version here, but
    it will be quite a bit slower.
    """

    def __init__(self, glove_fp=None, embedding=None):
        assert glove_fp or embedding
        super().__init__()
        self.pre_token = Whitespace()  # i.e. tokenize on whitespace + punct
        if glove_fp:
            self.model = GloveEmbedding(location=glove_fp)
        else:
            self.model = embedding

    def normalize(self, text):
        """Lowercases text."""
        return text.lower()

    def to_words(self, text):
        """Sentence to tokens (on whitespace)."""
        tokens = self.pre_token.pre_tokenize(text)
        # tokens is List[Tuple] where tuple: (token, position).
        return [x for x, _ in tokens]

    def __call__(self, text: str):
        text = self.normalize(text)
        words = self.to_words(text)
        ids = self.model.words_to_ids(words)
        ids = torch.LongTensor(ids)
        return self.model(ids), words
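# Hedged usage sketch for GloveTokenizer. The GloVe file name is illustrative and
# GloveEmbedding is project-specific, so the returned shapes depend on its implementation.
glove_tok = GloveTokenizer(glove_fp="glove.6B.300d.txt")
vectors, words = glove_tok("The quick brown fox")
print(words)          # lowercased whitespace/punctuation tokens
print(vectors.shape)  # roughly (num_tokens, embedding_dim)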
def create_train_bpe_tokenizer(
        bpe_vocab_size,
        asr_text_filepath='asr.txt',
        ttx_text_filepath='ttx.txt',
        save_tokenizer=True,
        tokenizer_filename=".\\data\\tokenizer-test.json"):
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        vocab_size=bpe_vocab_size)
    tokenizer.pre_tokenizer = Whitespace()
    files = [asr_text_filepath, ttx_text_filepath]
    files = [file for file in files if file]  # Get rid of None's
    tokenizer.train(files, trainer)
    if save_tokenizer:
        tokenizer.save(tokenizer_filename)
    return tokenizer
def main(args):
    # copy from https://github.com/xinjli/allosaurus
    ipa0 = [
        'I', 'a', 'aː', 'ã', 'ă', 'b', 'bʲ', 'bʲj', 'bʷ', 'bʼ', 'bː', 'b̞', 'b̤', 'b̥',
        'c', 'd', 'dʒ', 'dʲ', 'dː', 'd̚', 'd̥', 'd̪', 'd̯', 'd͡z', 'd͡ʑ', 'd͡ʒ', 'd͡ʒː',
        'd͡ʒ̤', 'e', 'eː', 'e̞', 'f', 'fʲ', 'fʷ', 'fː', 'g', 'gʲ', 'gʲj', 'gʷ', 'gː',
        'h', 'hʷ', 'i', 'ij', 'iː', 'i̞', 'i̥', 'i̯', 'j', 'k', 'kx', 'kʰ', 'kʲ', 'kʲj',
        'kʷ', 'kʷʼ', 'kʼ', 'kː', 'k̟ʲ', 'k̟̚', 'k͡p̚', 'l', 'lʲ', 'lː', 'l̪', 'm', 'mʲ',
        'mʲj', 'mʷ', 'mː', 'n', 'nj', 'nʲ', 'nː', 'n̪', 'n̺', 'o', 'oː', 'o̞', 'o̥',
        'p', 'pf', 'pʰ', 'pʲ', 'pʲj', 'pʷ', 'pʷʼ', 'pʼ', 'pː', 'p̚', 'q', 'r', 'rː',
        's', 'sʲ', 'sʼ', 'sː', 's̪', 't', 'ts', 'tsʰ', 'tɕ', 'tɕʰ', 'tʂ', 'tʂʰ', 'tʃ',
        'tʰ', 'tʲ', 'tʷʼ', 'tʼ', 'tː', 't̚', 't̪', 't̪ʰ', 't̪̚', 't͡s', 't͡sʼ', 't͡ɕ',
        't͡ɬ', 't͡ʃ', 't͡ʃʲ', 't͡ʃʼ', 't͡ʃː', 'u', 'uə', 'uː', 'u͡w', 'v', 'vʲ', 'vʷ',
        'vː', 'v̞', 'v̞ʲ', 'w', 'x', 'x̟ʲ', 'y', 'z', 'zj', 'zʲ', 'z̪', 'ä', 'æ', 'ç',
        'çj', 'ð', 'ø', 'ŋ', 'ŋ̟', 'ŋ͡m', 'œ', 'œ̃', 'ɐ', 'ɐ̞', 'ɑ', 'ɑ̱', 'ɒ', 'ɓ',
        'ɔ', 'ɔ̃', 'ɕ', 'ɕː', 'ɖ̤', 'ɗ', 'ə', 'ɛ', 'ɛ̃', 'ɟ', 'ɡ', 'ɡʲ', 'ɡ̤', 'ɡ̥',
        'ɣ', 'ɣj', 'ɤ', 'ɤɐ̞', 'ɤ̆', 'ɥ', 'ɦ', 'ɨ', 'ɪ', 'ɫ', 'ɯ', 'ɯ̟', 'ɯ̥', 'ɰ',
        'ɱ', 'ɲ', 'ɳ', 'ɴ', 'ɵ', 'ɸ', 'ɹ', 'ɹ̩', 'ɻ', 'ɻ̩', 'ɽ', 'ɾ', 'ɾj', 'ɾʲ',
        'ɾ̠', 'ʀ', 'ʁ', 'ʁ̝', 'ʂ', 'ʃ', 'ʃʲː', 'ʃ͡ɣ', 'ʈ', 'ʉ̞', 'ʊ', 'ʋ', 'ʋʲ', 'ʌ',
        'ʎ', 'ʏ', 'ʐ', 'ʑ', 'ʒ', 'ʒ͡ɣ', 'ʔ', 'ʝ', 'ː', 'β', 'β̞', 'θ', 'χ', 'ә', 'ḁ'
    ]

    ipa1, ipa2, ipa3 = ipa0.copy(), ipa0.copy(), ipa0.copy()
    random.shuffle(ipa1)
    random.shuffle(ipa2)
    random.shuffle(ipa3)

    # randomly joined to form training data
    passage0 = ' '.join(ipa0)
    passage1 = ' '.join(ipa1)
    passage2 = ' '.join(ipa2)
    passage3 = ' '.join(ipa3)
    data = [passage0, passage1, passage2, passage3]

    # setup
    tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
    # trainer = WordLevelTrainer(vocab_size=300, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    trainer = WordLevelTrainer(
        vocab_size=300,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.pre_tokenizer = Whitespace()

    # train the tokenizer
    tokenizer.train_from_iterator(data, trainer=trainer)
    tokenizer.save(args.outdir + '/ipa_tokenizer.json')
def get_tokenizer_trainer():
    # START init_tokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    tokenizer = Tokenizer(BPE())
    # END init_tokenizer

    # START init_trainer
    from tokenizers.trainers import BpeTrainer

    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    # END init_trainer

    # START init_pretok
    from tokenizers.pre_tokenizers import Whitespace

    tokenizer.pre_tokenizer = Whitespace()
    # END init_pretok

    return tokenizer, trainer
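# Hedged usage sketch: the tokenizer returned above is untrained, so it still needs
# a training pass before it can encode text (corpus and output file name are illustrative).
tokenizer, trainer = get_tokenizer_trainer()
tokenizer.train_from_iterator(["some text", "some more text"], trainer=trainer)
tokenizer.save("bpe-tokenizer.json")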
def train_tokenizer(sentences: List[str],
                    serialize_path: str = "",
                    vocab_size: int = 8000) -> Tokenizer:
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
def get_french_vocab(model_name):
    root = Path(os.getcwd()).parent.parent.parent
    french_corpus = "Datasets/corpora/fr/text"
    fr_corpus_path = os.path.join(root, french_corpus)

    files = []
    for dir_ in os.listdir(fr_corpus_path):
        fr_corpus_dir = os.path.join(fr_corpus_path, dir_)
        for text_file in os.listdir(fr_corpus_dir):
            text_file = os.path.join(fr_corpus_dir, text_file)
            files.append(text_file)

    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train(files,
                    vocab_size=20000,
                    min_frequency=2,
                    show_progress=True,
                    special_tokens=["<sos>", "<pad>", "<eos>", "<unk>"])
    print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
    tokenizer.save(model_name)
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)
    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer
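# Hedged usage sketch: reload the serialized tokenizer and round-trip a sentence
# (assumes DIR_TOKENIZERS from the snippet above and a prior training run).
reloaded = Tokenizer.from_file(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')
enc = reloaded.encode("Hello, world!")
print(enc.tokens)                # wordpieces wrapped in [CLS] ... [SEP]
print(reloaded.decode(enc.ids))  # the WordPiece decoder re-joins '##' continuations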
def build_new_vocab():
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()

    # files = [f"/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-{split}-factoid-7b.json" for split in ["train_split", "dev"]]
    files = "/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-train-factoid-7b.json"
    with open(files) as f:
        file = json.load(f)

    contexts = []
    for question in file['data']:
        for paragraph in question['paragraphs']:
            contexts.append(paragraph['context'])

    tokenizer.train_from_iterator(contexts, trainer)
    additional_vocab = [k for k, v in tokenizer.get_vocab().items()]
    tokenizer.save("tokenizer/tokenizer-bioasq.json")
    return additional_vocab
def __init__(self, tokenizer: PreTrainedTokenizerFast, cased: bool,
             target_vocab_size: int):
    """
    Args:
        tokenizer: A Rust-based 🤗 Tokenizer
        cased: If False, ignore uppercase characters in the corpus
        target_vocab_size: Size of the augmented vocabulary

    Raises:
        ValueError: If :obj:`target_vocab_size` is smaller than or equal to the
            existing vocabulary size of :obj:`tokenizer`
        RuntimeError: If :obj:`tokenizer` uses an unsupported tokenization model
    """
    if target_vocab_size <= tokenizer.vocab_size:
        raise ValueError(
            "Ensure that `target_vocab_size` is larger than the tokenizer's vocab size."
        )
    self.tokenizer = tokenizer
    self.cased = cased
    self.target_vocab_size = target_vocab_size
    self.model_cls: Type[BaseTokenizer] = tokenizer.backend_tokenizer.model.__class__

    # Instantiate rust tokenizer
    rust_tokenizer = Tokenizer(self.model_cls())
    if not cased:
        rust_tokenizer.normalizer = Lowercase()
    rust_tokenizer.pre_tokenizer = Whitespace()
    self.rust_tokenizer = rust_tokenizer

    # Instantiate the appropriate Trainer based on `self.model_cls` (i.e. BPE, WordPiece, etc.)
    trainer_cls = self.supported_trainers.get(self.model_cls, None)
    if trainer_cls is None:
        raise RuntimeError(f"{self.model_cls} is not supported")
    self.trainer = trainer_cls(
        vocab_size=self.target_vocab_size,
        special_tokens=list(self.tokenizer.special_tokens_map.values()),
    )
"CHEF_DO": 7, "MOVE_CONTENTS": 8, } k = len(output_vocab) with open("../data/res2idx.json", 'r') as f: for w, i in json.load(f).items(): output_vocab[w] = k k += 1 with open("../data/arg2idx.json", 'r') as f: for w, i in json.load(f).items(): output_vocab[w.replace('-', '_')] = k k += 1 output_vocab = {w: i for i, w in enumerate(output_vocab)} output_tokenizer = Tokenizer(WordLevel(output_vocab, )) output_tokenizer.pre_tokenizer = Whitespace() t = output_tokenizer.encode_batch( ["SERVE MOVE_CONTENTS", "SERVE MOVE_CONTENTS PUT"]) # print (t) csv_file = '../data/seq2seq_4335716.csv' input_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') input_tokenizer.bos_token = input_tokenizer.cls_token input_tokenizer.eos_token = input_tokenizer.sep_token val_data = load_dataset('csv', data_files=csv_file, split='train[90%:]') train_data = load_dataset('csv', data_files=csv_file, split='train[:90%]') # print(val_data) # print(train_data)
def test_instantiate(self):
    assert Whitespace() is not None
    assert isinstance(Whitespace(), PreTokenizer)
    assert isinstance(Whitespace(), Whitespace)
    assert isinstance(pickle.loads(pickle.dumps(Whitespace())), Whitespace)
print(v)

# Start vocabulary with all standard special tokens. (PAD=0!)
vocab = {}
for special_token in ["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]", "[BOS]", "[EOS]"]:
    vocab[special_token] = len(vocab)
# Add other words - if not already present.
for w in words:
    if w not in vocab:
        vocab[w] = len(vocab)
print(vocab)

# New tokenizer.
init_tokenizer = BertWordPieceTokenizer(vocab=vocab)
init_tokenizer.normalizer = Sequence(
    [Replace("(", " ( "), Replace(")", " ) "), BertNormalizer()])
init_tokenizer.pre_tokenizer = Whitespace()
#init_tokenizer.pad_token_id = vocab["[PAD]"]
#print("Created tokenizer: ", init_tokenizer)

# Save the created tokenizer.
init_tokenizer.save(decoder_tokenizer_path)
print("Tokenizer saved to: ", decoder_tokenizer_path)

# Load from tokenizer file.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=decoder_tokenizer_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]',
                              'cls_token': '[CLS]',
                              'sep_token': '[SEP]',
                              'unk_token': '[UNK]',
                              'mask_token': '[MASK]',
                              'bos_token': '[BOS]',
                              'eos_token': '[EOS]'})

print(f"\nLoaded tokenizer vocabulary ({len(tokenizer.get_vocab())}):\n" + "-" * 50)
for k, v in tokenizer.get_vocab().items():
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import TemplateProcessing

t = Tokenizer(WordLevel(unk_token="[UNK]"))
t.pre_tokenizer = Whitespace()
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"])
t.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    # pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 2),
        ("[SEP]", 3),
    ])

files = ['tok-train-shuf-tgt.tsv']
t.train(files, trainer)
t.save("code_tokenizer.json")
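# Hedged sanity-check sketch: the ids hard-coded in TemplateProcessing above must
# match the trained vocab; with the trainer's special_tokens order, [CLS] and [SEP]
# should land at ids 2 and 3 after training.
print(t.token_to_id("[CLS]"), t.token_to_id("[SEP]"))  # expected: 2 3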