# Ejemplo n.º 1
# 0
    def __init__(self,
                 vocab_size=25000,
                 min_freq=5,
                 lang="en",
                 files=None) -> None:
        """Build a byte-level BPE tokenizer wrapper.

        Args:
            vocab_size: (int) target vocabulary size for training.
            min_freq: (int) minimum pair frequency required for a merge.
            lang: (str) language tag; stored on the instance only.
            files: (List[str]) optional ["vocab.json", "merge.txt"] pair used
                to initialize the BPE model. Defaults to [None, None], i.e.
                an untrained model.
        """
        super().__init__()

        # Resolve the default here instead of using a mutable default
        # argument (a shared list in the signature is a classic pitfall).
        if files is None:
            files = [None, None]
        self.tokenizer = Tokenizer(BPE(files[0], files[1]))

        self.lang = lang
        self.trainer = BpeTrainer(vocab_size=vocab_size,
                                  min_frequency=min_freq,
                                  special_tokens=["[PAD]", "[SEP]"],
                                  initial_alphabet=ByteLevel.alphabet())

        # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
        # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()
    def test_can_modify(self):
        # Start from the non-default configuration.
        byte_level = ByteLevel(add_prefix_space=False)
        assert byte_level.add_prefix_space == False

        # The attribute is writable: flip it and read it back.
        byte_level.add_prefix_space = True
        assert byte_level.add_prefix_space == True
# Ejemplo n.º 3
# 0
 def __init__(self):
     # Wrap an untrained byte-level BPE model: NFKC normalization,
     # byte-level pre-tokenization, and the matching byte-level decoder.
     bpe_tokenizer = Tokenizer(BPE())
     bpe_tokenizer.normalizer = Sequence([NFKC()])
     bpe_tokenizer.pre_tokenizer = ByteLevel()
     bpe_tokenizer.decoder = ByteLevelDecoder()
     self.tokenizer = bpe_tokenizer
def tokenizer_pipeline():
    """Build the BPE tokenization pipeline for the Cebuano corpus.

    Returns an untrained byte-level BPE ``Tokenizer`` whose normalizer
    NFD-decomposes the text, strips accents, and lower-cases it.
    """
    bpe = Tokenizer(BPE())

    # Normalize first, then pre-tokenize/decode at the byte level so
    # arbitrary input round-trips through encode/decode.
    bpe.normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
    bpe.pre_tokenizer = ByteLevel()
    bpe.decoder = ByteLevelDecoder()
    return bpe
# Ejemplo n.º 5
# 0
 def bpe_train(self, paths):
     """Train the wrapped BPE tokenizer on the given corpus files.

     Args:
         paths: list of text-file paths to train on.
     """
     # Fix: the original passed the misspelled kwarg ``inital_alphabet``,
     # so the byte-level initial alphabet was never applied.
     trainer = BpeTrainer(vocab_size=50000,
                          show_progress=True,
                          initial_alphabet=ByteLevel.alphabet(),
                          special_tokens=[
                              "<s>",
                              "<pad>",
                              "</s>",
                              "<unk>",
                              "<mask>",
                              "<company>",
                              "<label>",
                              "<category>",
                              "<review>",
                          ])
     self.tokenizer.train(trainer, paths)
# Ejemplo n.º 6
# 0
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """Load or (re)train the tokenizer for ``dataset`` with algorithm ``style``.

    When ``force_retrain`` is False and a saved tokenizer file exists, it is
    loaded from disk; otherwise a new tokenizer is trained and saved to the
    tokenizers directory.

    Returns:
        (tokenizer, vocab) — ``vocab`` is currently always None (see TODO).
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    # Reuse a stored tokenizer when allowed and present on disk.
    tokenizer = None
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
    else:
        print('%s tokenizer file does not exist; training new tokenizer' %
              tpath_expected)

    if tokenizer is None:
        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Per-algorithm setup: model + trainer + pre-tokenizer + decoder.
        # see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()

        # Training call differs for the arxiv dataset vs wikitext; see
        # https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
        if dataset == 'arxiv':
            tokenizer.train_from_iterator(datafiles, trainer=trainer)
        else:
            tokenizer.train(datafiles, trainer=trainer)

        # Decoder must match the pre-tokenizer; WordPiece seems to work for
        # WordLevel (adds back spaces).
        if style == 'BPE':
            tokenizer.decoder = decoders.ByteLevel()
        else:
            tokenizer.decoder = decoders.WordPiece()

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # TODO: build a vocabulary object matching the retired torchtext Vocab API
    # (stoi/itos lookups, len(vocab.stoi) for ntokens, vocab.unk_index) on top
    # of the trained tokenizer, or confirm Tokenizer already provides it:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                         dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
# Ejemplo n.º 7
# 0
 def test_instantiate(self):
     # Construction succeeds with and without the add_prefix_space flag,
     # and the result is a proper PreTokenizer.
     for pretok in (ByteLevel(),
                    ByteLevel(add_prefix_space=True),
                    ByteLevel(add_prefix_space=False)):
         assert pretok is not None
     assert isinstance(ByteLevel(), PreTokenizer)
# Ejemplo n.º 8
# 0
# Script example: build and train a byte-level BPE tokenizer end to end.
# NOTE(review): ``Tokenizer`` is used below but not imported in this snippet —
# presumably ``from tokenizers import Tokenizer`` appears elsewhere; confirm.
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel

# First we create an empty Byte-Pair Encoding model (i.e. not trained model)
tokenizer = Tokenizer(BPE())

# Then we enable lower-casing and unicode-normalization
# The Sequence normalizer allows us to combine multiple Normalizer that will be
# executed in order.
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

# Our tokenizer also needs a pre-tokenizer responsible for converting the input
# to a ByteLevel representation.
tokenizer.pre_tokenizer = ByteLevel()

# And finally, let's plug a decoder so we can recover from a tokenized input
# to the original one
tokenizer.decoder = ByteLevelDecoder()

from tokenizers.trainers import BpeTrainer

# We initialize our trainer, giving him the details about the vocabulary we want
# to generate
trainer = BpeTrainer(vocab_size=25000,
                     show_progress=True,
                     initial_alphabet=ByteLevel.alphabet())

# Train on a single local corpus file (hard-coded absolute path).
# NOTE(review): the positional (trainer, files) argument order is the pre-0.9
# tokenizers API — confirm against the installed library version.
tokenizer.train(trainer,
                ["/Volumes/750GB-HDD/root/Question-Answering/pyData/big.txt"])
 def test_manual_reload(self):
     # Serialize the pre-tokenizer's state and rebuild an equivalent
     # instance from the decoded JSON payload.
     original = ByteLevel()
     saved_state = json.loads(original.__getstate__())
     restored = ByteLevel(**saved_state)
     assert isinstance(restored, ByteLevel)
    # NOTE(review): orphaned function body — the enclosing ``def`` line is not
    # visible here; this appears to duplicate the tail of ``tokenizer_pipeline``.
    tokenizer = Tokenizer(BPE())

    # string normalization
    tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
    tokenizer.pre_tokenizer = ByteLevel()
    tokenizer.decoder = ByteLevelDecoder()
    return tokenizer


if __name__ == "__main__":
    # preparing corpus for wiki: flatten the wiki-extractor JSON into a
    # plain-text file the trainer can consume
    en_vocab_size = 50257  # matches GPT-2's vocabulary size
    wiki_txt = load_text_file_json('text/AA/wiki_00.json', 'text')
    write_text_file(wiki_txt, 'wiki-corpus.txt')

    # corpus name -> path of its training text file
    corpus_files = {
        'wiki-corpus': 'wiki-corpus.txt',
        'oscar-corpus': 'shuff-dedup/ceb/ceb_dedup.txt'
    }

    # define a trainer for the tokenizer
    trainer = BpeTrainer(vocab_size=en_vocab_size,
                         show_progress=True,
                         initial_alphabet=ByteLevel.alphabet(),
                         special_tokens=['<|endoftext|>', '<pad>'])

    # train one fresh tokenizer per corpus and persist it as JSON
    # NOTE(review): positional (files, trainer) order is the modern
    # tokenizers API — differs from train(trainer, files) used elsewhere
    # in this file; confirm the installed library version.
    for corpus, path in corpus_files.items():
        tokenizer = tokenizer_pipeline()
        tokenizer.train([path], trainer)
        tokenizer.save(f'model/{corpus}-tokenizer.json')