Example #1
0
def get_tokenizer(args):
    """Build a byte-level BPE tokenizer, loading it from disk or training it.

    When ``args.tokenizer_dir`` already contains ``vocab.json`` and
    ``merges.txt`` the BPE model is loaded from those files. Otherwise a new
    model is trained on ``train.json``, ``val.json`` and ``test.json`` from
    ``args.data_dir`` and saved into ``args.tokenizer_dir``.

    Args:
        args: Object providing the ``tokenizer_dir``, ``data_dir`` and
            ``vocab_size`` attributes.

    Returns:
        The configured ``Tokenizer``.
    """
    special_tokens = ["[UNK]", "[PAD]", "[BOS]", "[EOS]"]

    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''),
         Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    vocab_fn = os.path.join(args.tokenizer_dir, 'vocab.json')
    merge_fn = os.path.join(args.tokenizer_dir, 'merges.txt')

    # Check for the model files themselves, not just the directory: a
    # directory left empty by an interrupted training run must trigger a
    # retrain instead of failing in ``from_file`` on every later call.
    if os.path.isfile(vocab_fn) and os.path.isfile(merge_fn):
        tokenizer.model = models.BPE.from_file(vocab_fn, merge_fn)
        # The training branch hands these tokens to the trainer as special
        # tokens; register them here as well so a loaded tokenizer treats
        # them the same way as a freshly trained one.
        tokenizer.add_special_tokens(special_tokens)
    else:
        os.makedirs(args.tokenizer_dir, exist_ok=True)
        trainer = trainers.BpeTrainer(
            vocab_size=args.vocab_size,
            special_tokens=special_tokens)
        files = [
            os.path.join(args.data_dir, split)
            for split in ['train.json', 'val.json', 'test.json']
        ]
        tokenizer.train(files=files, trainer=trainer)
        tokenizer.model.save(args.tokenizer_dir)

    return tokenizer
Example #2
0
    def get_tokenizer(self, tokenizer_dir):
        """Load a byte-level BPE tokenizer from ``tokenizer_dir``.

        The BPE model is read from ``vocab.json`` and ``merges.txt`` inside
        ``tokenizer_dir``; the ``[UNK]``, ``[PAD]``, ``[BOS]`` and ``[EOS]``
        tokens are then registered as special tokens.

        Args:
            tokenizer_dir: Directory holding the two model files.

        Returns:
            The configured ``Tokenizer``.
        """
        vocab_path = os.path.join(tokenizer_dir, 'vocab.json')
        merges_path = os.path.join(tokenizer_dir, 'merges.txt')

        # Text pipeline: normalize, then byte-level pre-tokenize / decode.
        bpe_tokenizer = Tokenizer(models.BPE())
        bpe_tokenizer.normalizer = Sequence([
            NFKC(),
            Replace('\r', ''),
            Replace('\n', ' '),
        ])
        bpe_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        bpe_tokenizer.decoder = decoders.ByteLevel()

        # Swap the empty placeholder model for the one stored on disk.
        bpe_tokenizer.model = models.BPE.from_file(vocab_path, merges_path)

        special_tokens = ['[UNK]', '[PAD]', '[BOS]', '[EOS]']
        bpe_tokenizer.add_special_tokens(special_tokens)
        return bpe_tokenizer
Example #3
0
def main(args):
    """Train a WordPiece tokenizer on raw text files and save it.

    The tokenizer normalizes with NFD + lowercase + accent stripping and
    pre-tokenizes on whitespace. The trained vocabulary is written to
    ``args.output_dir`` with the ``bert-tokenizer-kr`` prefix, and the full
    tokenizer is serialized to ``bert-tokenizer-kr.json`` in the same
    directory.

    Args:
        args: Object providing ``text_raw_files_pattern`` (glob pattern of
            the training files) and ``output_dir``.

    Raises:
        FileNotFoundError: If the glob pattern matches no files.
    """
    from tokenizers import Tokenizer
    from tokenizers import normalizers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import WordPieceTrainer

    # Fail early: training on an empty file list would silently produce a
    # tokenizer that knows nothing beyond the special tokens.
    files = glob.glob(args.text_raw_files_pattern)
    if not files:
        raise FileNotFoundError(
            f"No training files match pattern: "
            f"{args.text_raw_files_pattern!r}")

    bert_tokenizer = Tokenizer(WordPiece())
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]", "[MASK]"],
    )
    # Pass by keyword: the positional order of ``Tokenizer.train`` differs
    # between tokenizers releases (trainer-first in older ones, files-first
    # in newer ones), while the parameter names are the same.
    bert_tokenizer.train(files=files, trainer=trainer)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir,
                                            "bert-tokenizer-kr")
    # Reload the saved vocabulary so the model carries an explicit unknown
    # token before the whole tokenizer is serialized.
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")

    bert_tokenizer.save(os.path.join(args.output_dir,
                                     "bert-tokenizer-kr.json"))
Example #4
0
# Train a lowercase byte-level BPE tokenizer on `names.txt`, save and reload
# its model, then derive character-level lookup tables from the vocabulary.
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    # NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True)
# Keyword arguments: the positional order of `Tokenizer.train` differs
# between tokenizers releases, the parameter names do not.
tokenizer.train(files=[f"{proc_path}/names.txt"], trainer=trainer)

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(proc_path)

tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json',
                                f'{proc_path}/merges.txt')

# Byte-level BPE vocabularies contain non-ASCII characters, so read the file
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open(f"{proc_path}/vocab.json", "r", encoding="utf-8") as f:
    bpe_vocab = json.load(f)

# Reverse lookup: token id -> token string.
bpe_vocab_idx = {v: k for k, v in bpe_vocab.items()}

# Single-character tokens, with ids shifted up by one so that 0 stays free.
char_map = {k: v + 1 for k, v in bpe_vocab.items() if len(k) == 1}
print(f"Char map size: {len(char_map)}\n")

MAX_LEN_OF_WORD = max(len(w) for w in bpe_vocab)
print(f"Max length of word: {MAX_LEN_OF_WORD}\n")

if ZERO_PAD:
    word_map = {
        k: [char_map[c] for c in k] + [0] * (MAX_LEN_OF_WORD - len(k))
Example #5
0
tokenizer.train(
    trainer,
    ["/Volumes/750GB-HDD/root/Question-Answering/pyData/big.txt"],
)

print(f"Trained vocab size: {tokenizer.get_vocab_size()}")

# That is all it takes to train a first tokenizer from scratch with
# `tokenizers`. This only covers the basics; the add_special_tokens and
# special_tokens parameters on the Trainer class are worth a look, but the
# overall process stays very similar.

# Write the trained model files to the output directory.
tokenizer.model.save('/Volumes/750GB-HDD/root/Question-Answering/pyData')

# Reload the saved model and tokenize a simple input.
# NOTE(review): `pyData` is defined outside this excerpt; the concatenation
# below only yields valid paths if it ends with a path separator — confirm.
tokenizer.model = BPE(pyData + 'vocab.json', pyData + 'merges.txt')
encoding = tokenizer.encode("This is a simple input to be tokenized")

print(f"Encoded string: {encoding.tokens}")

decoded = tokenizer.decode(encoding.ids)
print(f"Decoded string: {decoded}")

# Getting started with transformers
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer

# Turn gradient tracking off globally.
torch.set_grad_enabled(False)

# Name of the pretrained model to use.
MODEL_NAME = "bert-base-cased"
Example #6
0
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFD, Lowercase, Strip


def train(dataset_path,
          output_dir='data/tokenizer/',
          vocab_size=30_000,
          min_frequency=3):
    """Train a WordPiece tokenizer on a single text file and save it.

    The model vocabulary and a serialized ``tokenizer.json`` are both written
    into ``output_dir``.

    Args:
        dataset_path: Path of the training text file.
        output_dir: Directory receiving the output files; created if it does
            not exist. A trailing slash is optional.
        vocab_size: Passed to ``WordPieceTrainer`` as ``vocab_size``.
        min_frequency: Passed to ``WordPieceTrainer`` as ``min_frequency``.
    """
    # Local import keeps this function self-contained; the module's visible
    # imports do not include ``os``.
    import os

    trainer = WordPieceTrainer(vocab_size=vocab_size,
                               min_frequency=min_frequency,
                               special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])
    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), Strip()])

    # Keyword arguments: the positional order of ``Tokenizer.train`` differs
    # between tokenizers releases, the parameter names do not.
    tokenizer.train(files=[dataset_path], trainer=trainer)

    # Make sure the target directory exists before anything is written.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model_files = tokenizer.model.save(output_dir)
    tokenizer.model = WordPiece.from_file(*model_files, unk_token='[UNK]')

    # Join instead of concatenating so ``output_dir`` works with or without
    # a trailing slash.
    tokenizer.save(os.path.join(output_dir, 'tokenizer.json'))


# Hand `train` to `fire.Fire` when this file is executed as a script.
# NOTE(review): no `import fire` is visible in this excerpt — confirm it is
# imported at module level.
if __name__ == '__main__':
    fire.Fire(train)
Example #7
0
print(f"Trained vocab size: {tokenizer.get_vocab_size()}")

# %%
SAVE_PATH = Path('tokenizers')
PATH = SAVE_PATH / 'bytelevel-bpe-tokenizer-model'
if not PATH.exists():
    PATH.mkdir(parents=True, exist_ok=True)

# %%
# Save the model.
tokenizer.model.save(str(PATH))

# %%
# Reload it when needed (works seamlessly together with transformers).
# Note: in practice the tokenizer has to be rebuilt the same way it was set
# up for training before the model is loaded into it.
tokenizer.model = BPE(
    vocab=str(PATH / 'vocab.json'),
    merges=str(PATH / 'merges.txt'),
)

# %%
# Encode / decode.
encoded = tokenizer.encode("This is a simple input to be tokenized.")
print(f"Encoded string: {encoded.tokens}")

decoded = tokenizer.decode(encoded.ids)
print(f"Decoded string: {decoded}")

# %%
from tokenizers import ByteLevelBPETokenizer
# tokenizer提供了一些经典tokenization算法的高级封装
# 譬如可以用`ByteLevelBPETokenizer`简单地重写上面的内容
Example #8
0
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, Lowercase, Strip


def train(dataset_path,
          output_dir='data/tokenizer/',
          vocab_size=30_000,
          min_frequency=3):
    """Train a BPE tokenizer on a single text file and save it.

    The model files and a serialized ``tokenizer.json`` are both written into
    ``output_dir``.

    Args:
        dataset_path: Path of the training text file.
        output_dir: Directory receiving the output files; created if it does
            not exist. A trailing slash is optional.
        vocab_size: Passed to ``BpeTrainer`` as ``vocab_size``.
        min_frequency: Passed to ``BpeTrainer`` as ``min_frequency``.
    """
    # Local import keeps this function self-contained; the module's visible
    # imports do not include ``os``.
    import os

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([Lowercase(), Strip()])

    # Keyword arguments: the positional order of ``Tokenizer.train`` differs
    # between tokenizers releases, the parameter names do not.
    tokenizer.train(files=[dataset_path], trainer=trainer)

    # Make sure the target directory exists before anything is written.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model_files = tokenizer.model.save(output_dir)
    tokenizer.model = BPE.from_file(*model_files, unk_token='[UNK]')

    # Join instead of formatting so the default ``output_dir`` (which ends
    # with a slash) does not produce a doubled separator.
    tokenizer.save(os.path.join(output_dir, 'tokenizer.json'))


# Hand `train` to `fire.Fire` when this file is executed as a script.
# NOTE(review): no `import fire` is visible in this excerpt — confirm it is
# imported at module level.
if __name__ == '__main__':
    fire.Fire(train)