def __init__(
    self,
    vocab_file,
    merges_file,
    unk_token="<|endoftext|>",
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    pad_to_max_length=False,
    add_prefix_space=False,
    max_length=None,
    stride=0,
    truncation_strategy="longest_first",
    **kwargs
):
    super(GPT2TokenizerFast, self).__init__(
        bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
    )

    self._tokenizer = tk.Tokenizer(tk.models.BPE.from_files(vocab_file, merges_file))
    self._update_special_tokens()
    self._tokenizer.with_pre_tokenizer(
        tk.pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
    )
    self._tokenizer.with_decoder(tk.decoders.ByteLevel.new())
    if max_length:
        self._tokenizer.with_truncation(max_length, stride=stride, strategy=truncation_strategy)
    self._tokenizer.with_padding(
        max_length=max_length if pad_to_max_length else None,
        direction=self.padding_side,
        pad_id=self.pad_token_id if self.pad_token_id is not None else 0,
        pad_type_id=self.pad_token_type_id,
        pad_token=self.pad_token if self.pad_token is not None else "",
    )
    self._decoder = tk.decoders.ByteLevel.new()
def __init__(
    self,
    vocab_file,
    do_lower_case=True,
    do_basic_tokenize=True,
    never_split=None,
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    tokenize_chinese_chars=True,
    max_length=None,
    pad_to_max_length=False,
    stride=0,
    truncation_strategy="longest_first",
    add_special_tokens=True,
    **kwargs
):
    super(BertTokenizerFast, self).__init__(
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        **kwargs,
    )

    self._tokenizer = tk.Tokenizer(tk.models.WordPiece.from_files(vocab_file, unk_token=unk_token))
    self._update_special_tokens()
    self._tokenizer.with_pre_tokenizer(
        tk.pre_tokenizers.BertPreTokenizer.new(
            do_basic_tokenize=do_basic_tokenize,
            do_lower_case=do_lower_case,
            tokenize_chinese_chars=tokenize_chinese_chars,
            never_split=never_split if never_split is not None else [],
        )
    )
    self._tokenizer.with_decoder(tk.decoders.WordPiece.new())
    if add_special_tokens:
        self._tokenizer.with_post_processor(
            tk.processors.BertProcessing.new(
                (sep_token, self._tokenizer.token_to_id(sep_token)),
                (cls_token, self._tokenizer.token_to_id(cls_token)),
            )
        )
    if max_length is not None:
        self._tokenizer.with_truncation(max_length, stride=stride, strategy=truncation_strategy)
    self._tokenizer.with_padding(
        max_length=max_length if pad_to_max_length else None,
        direction=self.padding_side,
        pad_id=self.pad_token_id,
        pad_type_id=self.pad_token_type_id,
        pad_token=self.pad_token,
    )
    self._decoder = tk.decoders.WordPiece.new()
def test_works_in_simple_pipeline(self):
    pretok = self.dict.pre_tokenizer()
    vocab = {"[UNK]": 0, "京都": 1, "に": 2, "行く": 3}
    tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tok.pre_tokenizer = pretok
    res = tok.encode("京都へ行く")
    self.assertEqual(res.ids, [1, 0, 3])
def test_with_handler(self):
    def _handler(index, sentence: tokenizers.NormalizedString, ml: MorphemeList):
        return [
            tokenizers.NormalizedString(ml[0].part_of_speech()[0]),
            tokenizers.NormalizedString(str(len(ml))),
        ]

    pretok = self.dict.pre_tokenizer(sudachipy.SplitMode.A, handler=_handler)
    vocab = {"[UNK]": 0, "名詞": 6, "4": 7}
    tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tok.pre_tokenizer = pretok
    res = tok.encode("外国人参政権")
    self.assertEqual(res.ids, [6, 7])
def make_tokenizer(k: int):
    """Make a tokenizer for k-mer gene sequences."""
    # Vocabulary: the special tokens followed by every possible k-mer over the
    # nucleotide alphabet.
    keys = extras + list(
        "".join(token) for token in itertools.product(*(nucleotides for _ in range(k)))
    )
    values = range(len(keys))
    vocab = dict(zip(keys, values))
    tokenizer = tokenizers.Tokenizer(
        tokenizers.models.WordLevel(vocab=vocab, unk_token=unknown)
    )
    tokenizer.enable_padding(pad_token=padding)
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.WhitespaceSplit()
    return tokenizer
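# Minimal usage sketch (not from the original source). make_tokenizer relies on
# module-level constants defined elsewhere; the values below are assumptions
# chosen only to make the sketch self-contained.
unknown = "[UNK]"
padding = "[PAD]"
extras = [unknown, padding]
nucleotides = ["A", "C", "G", "T"]

tok = make_tokenizer(k=3)            # vocab = 2 special tokens + 4**3 k-mers
enc = tok.encode("ATG CGT TTA")      # whitespace-separated 3-mers
print(enc.tokens, enc.ids)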
def test_works_with_different_split_mode(self):
    pretok = self.dict.pre_tokenizer(sudachipy.SplitMode.A)
    vocab = {
        "[UNK]": 0,
        "外国": 1,
        "参政": 2,
        "権": 3,
        "外国人参政権": 4,
        "人": 5,
    }
    tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tok.pre_tokenizer = pretok
    res = tok.encode("外国人参政権")
    self.assertEqual(res.ids, [1, 5, 2, 3])
def train_tokenizer() -> Tuple[tokenizers.Tokenizer, tf.data.Dataset]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents(),
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False),
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)],
    )

    dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="test")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000,
            special_tokens=["<unk>", "[SEP]", "<s>", "</s>"],
        ),
    )

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                for sentence in sent_tokenizer(record['text']):
                    yield sentence

    # Each element produced by the generator is a single sentence string.
    data = tf.data.Dataset.from_generator(
        generator,
        output_signature=tf.TensorSpec(shape=(), dtype=tf.string),
    )
    data = data.map(tf.strings.strip, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return tokenizer, data
def train_tokenizer() -> Tuple[tokenizers.Tokenizer, Callable]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents(),
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False),
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)],
    )

    # dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
    dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="validation")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000,
            special_tokens=["<unk>", "[SEP]", "<s>", "</s>"],
        ),
    )

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                yield record['text']

    return tokenizer, generator
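# Illustrative usage of the train_tokenizer() variant above; the variable names
# here are mine, not the original author's.
tokenizer, text_generator = train_tokenizer()

# Encode the first non-empty record and round-trip it through the decoder.
sample = next(text_generator())
encoding = tokenizer.encode(sample)
print(encoding.tokens[:10])
print(tokenizer.decode(encoding.ids))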
def __init__(
    self,
    word_id_map=None,
    pad_token_id=None,
    unk_token_id=None,
    unk_token="[UNK]",
    sep_token="[SEP]",
    cls_token="[CLS]",
    pad_token="[PAD]",
    lowercase: bool = False,
    unicode_normalizer=None,
):
    # Avoid a shared mutable default argument: each instance gets its own map.
    if word_id_map is None:
        word_id_map = {}
    if pad_token_id:
        word_id_map[pad_token] = pad_token_id
    if unk_token_id:
        word_id_map[unk_token] = unk_token_id
    max_id = max(word_id_map.values())
    # Assign missing special tokens ids strictly above the current maximum,
    # so they never collide with an existing word id.
    for idx, token in enumerate((unk_token, sep_token, cls_token, pad_token)):
        if token not in word_id_map:
            word_id_map[token] = max_id + idx + 1

    # HuggingFace tokenizer expects a path to a `*.json` file to read the
    # vocab from. I think this is kind of a silly constraint, but for now
    # we write the vocab to a temporary file before initialization.
    word_list_file = tempfile.NamedTemporaryFile()
    word_list_file.write(json.dumps(word_id_map).encode())
    word_list_file.flush()  # make sure the vocab is on disk before WordLevel reads it

    word_level = hf_tokenizers.models.WordLevel(
        word_list_file.name, unk_token=str(unk_token)
    )
    tokenizer = hf_tokenizers.Tokenizer(word_level)

    # Let the tokenizer know about special tokens if they are part of the vocab
    for token in (unk_token, sep_token, cls_token, pad_token):
        if tokenizer.token_to_id(str(token)) is not None:
            tokenizer.add_special_tokens([str(token)])

    # Check for Unicode normalization first (before everything else)
    normalizers = []
    if unicode_normalizer:
        normalizers += [
            hf_tokenizers.normalizers.unicode_normalizer_from_str(unicode_normalizer)
        ]
    if lowercase:
        normalizers += [hf_tokenizers.normalizers.Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = hf_tokenizers.normalizers.Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = hf_tokenizers.pre_tokenizers.WhitespaceSplit()

    sep_token_id = tokenizer.token_to_id(str(sep_token))
    if sep_token_id is None:
        raise TypeError("sep_token not found in the vocabulary")
    cls_token_id = tokenizer.token_to_id(str(cls_token))
    if cls_token_id is None:
        raise TypeError("cls_token not found in the vocabulary")

    tokenizer.post_processor = hf_tokenizers.processors.BertProcessing(
        (str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
    )

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "lowercase": lowercase,
        "unicode_normalizer": unicode_normalizer,
    }

    self.unk_token = unk_token
    self.pad_token = pad_token
    super().__init__(tokenizer, parameters)
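# Hypothetical usage of the constructor above. "WordLevelTokenizer" is a
# stand-in class name (the real class name is not shown in this excerpt), and
# encode() is assumed to be provided by the parent tokenizer class.
word_id_map = {"hello": 1, "world": 2}
tok = WordLevelTokenizer(word_id_map=word_id_map, lowercase=True)
encoding = tok.encode("Hello world")   # out-of-vocabulary words map to "[UNK]"
print(encoding.tokens)                 # includes "[CLS]" / "[SEP]" from BertProcessing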
train_csv_df.to_csv(config['train_csv'], index=False, header=True)

# Labelled test CSV file
print("Save labelled csv for inference ", config['test_csv'])
test_csv_df.to_csv(config['test_csv'], index=False, header=True)

print("Setup tokenizers...")
unknown_word = 'unknown_word'
full_set = set(list(count_vector.vocabulary_.keys()) + list(word_list.keys()))
# full_set = set(list(count_vector.vocabulary_.keys()))
print("Number of words : (This has to be in config)", len(full_set) + 2)
vocab = {
    w: i for i, w in enumerate([unknown_word, 'dumb_token'] + list(full_set))
}
tokenizer = tokenizers.Tokenizer(WordLevel(vocab, unknown_word))
tokenizer.pre_tokenizer = Whitespace()
print("Use padding length ", config['padding_length'])
tokenizer.enable_padding(length=int(config['padding_length']))

# Save tokenizer
recompute = False
if recompute:
    print("Save tokenizer ", config['token_config'])
    tokenizer.save(config['token_config'])
tokenizer = tokenizers.Tokenizer.from_file(config['token_config'])
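# Optional sanity check on the reloaded tokenizer (the sentence below is
# illustrative, not from the original script).
encoding = tokenizer.encode("a quick sanity check sentence")
print(encoding.tokens)     # out-of-vocabulary words come back as 'unknown_word'
print(len(encoding.ids))   # fixed length if padding was saved with the tokenizer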
pre_tokenizer = Whitespace()
tokenized_texts = [[w for w, _ in pre_tokenizer.pre_tokenize_str(t)] for t in texts]

c = Counter()
for text in tokenized_texts:
    c.update(text)

token2id = {
    word: i + 1
    for i, (word, count) in enumerate(c.most_common(max_vocab_size))
}
# usually, UNK is assigned index 0 or 1
token2id[unk_token] = 0

tokenizer = tokenizers.Tokenizer(WordLevel(token2id, unk_token))
tokenizer.pre_tokenizer = pre_tokenizer
return tokenizer


def accuracy(probs, targets):
    """Computes accuracy given predicted probabilities and expected labels.

    Args:
        probs: torch.FloatTensor[batch_size, 1], probabilities of a positive class
        targets: torch.LongTensor[batch_size, 1], true classes

    Returns:
        0 <= float <= 1, proportion of correct predictions
    """
    predictions = (probs >= 0.5).flatten()