def normalizer(self, proto):
    list_normalizers = [
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
    ]
    if not self.original_tokenizer.keep_accents:
        list_normalizers.append(normalizers.NFKD())
        list_normalizers.append(normalizers.StripAccents())
    if self.original_tokenizer.do_lower_case:
        list_normalizers.append(normalizers.Lowercase())

    precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
    list_normalizers.append(normalizers.Precompiled(precompiled_charsmap))
    list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
    return normalizers.Sequence(list_normalizers)
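# A minimal sketch (not from the source) of what the composed normalizer does on a
# sample string. The Precompiled step is omitted because it needs the SentencePiece
# proto's charsmap; lowercasing and accent stripping are assumed to be enabled.
from tokenizers import Regex, normalizers

_norm = normalizers.Sequence([
    normalizers.Replace("``", '"'),
    normalizers.Replace("''", '"'),
    normalizers.NFKD(),
    normalizers.StripAccents(),
    normalizers.Lowercase(),
    normalizers.Replace(Regex(" {2,}"), " "),
])
print(_norm.normalize_str("``Héllo   Wörld''"))  # -> '"hello world"' (quotes unified, accents stripped, spaces collapsed)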
from typing import Tuple

import datasets
import tensorflow as tf
import tokenizers
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers

# Assumed sentence splitter: any callable that splits a paragraph into sentences
# works here, e.g. NLTK's sent_tokenize.
from nltk.tokenize import sent_tokenize as sent_tokenizer


def train_tokenizer() -> Tuple[tokenizers.Tokenizer, tf.data.Dataset]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="test")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000,
            special_tokens=["<unk>", "[SEP]", "<s>", "</s>"]))

    # Yield one non-empty sentence at a time for the tf.data pipeline.
    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                for sentence in sent_tokenizer(record['text']):
                    yield sentence

    # Each element is a scalar string (one sentence).
    data = tf.data.Dataset.from_generator(
        generator,
        output_signature=tf.TensorSpec(shape=(), dtype=tf.string))
    data = data.map(tf.strings.strip,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return tokenizer, data
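# A minimal usage sketch, assuming the function above: train the tokenizer, then
# encode a few stripped sentences from the tf.data pipeline.
tokenizer, data = train_tokenizer()
for sentence in data.take(3):
    text = sentence.numpy().decode("utf-8")
    print(tokenizer.encode(text).tokens[:10])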
from typing import Callable, Iterator, Tuple


# Variant of the trainer above: trains on the validation split and returns the
# raw-text generator itself instead of wrapping it in a tf.data.Dataset
# (same tokenizers/datasets imports as above; TensorFlow and NLTK are not needed).
def train_tokenizer() -> Tuple[tokenizers.Tokenizer, Callable[[], Iterator[str]]]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    # dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
    dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="validation")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000,
            special_tokens=["<unk>", "[SEP]", "<s>", "</s>"]))

    # Yield non-empty records as raw text, without sentence splitting.
    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                yield record['text']

    return tokenizer, generator
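# Usage sketch for the generator-returning variant: the returned callable can feed
# downstream training or evaluation loops directly.
tokenizer, text_gen = train_tokenizer()
for i, text in enumerate(text_gen()):
    print(tokenizer.encode(text).ids[:10])
    if i == 2:
        break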