Example #1
def train_tokenizer(args):
    """[summary]

    Arguments:
        args {[dictionary]} -- [arguments객체]
    """

    # Tokenizer train
    morpheme_func = None

    if args.tokenizer.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif args.tokenizer.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # tokenizer-type", type=str, choices=["bbpe", "cbpe", "wp"], default="bbpe"
    if args.tokenizer.tokenizer_type == "bbpe":
        # tokenizer = BytelevelBPETokenizer()
        tokenizer = Tokenizer(BPE())
        # tokenizer.pre_tokenizer = BertPreTokenizer()
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")  # must be an instance; the space delimiter is an assumption
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        # tokenizer.pre_tokenizer = Whitespace
        trainer = WordPieceTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )

    # pass the configured trainer; otherwise train_from_iterator falls back to a default one
    tokenizer.train_from_iterator(get_pretokenize_generator(morpheme_func), trainer=trainer)

    tokenizer.save(f"../vocab/{args.tokenizer.tokenizer_type}.vocab")
    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids   :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    datasets = get_datasets(args.tokenizer.data_path)

    for line in datasets:
        print(line)
        break
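For context, a minimal sketch of the configuration object this function expects; the field names are taken from the attribute accesses above, and the use of OmegaConf is only an assumption based on omegalist_to_list:

from omegaconf import OmegaConf

# Hypothetical config; values are placeholders, only the field names come from the function above.
args = OmegaConf.create({
    "tokenizer": {
        "pretokenizer_type": "mecab",   # or "khaiii"
        "tokenizer_type": "bbpe",       # "bbpe", "cbpe" or "wp"
        "special_tokens": ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        "vocab_size": 32000,
        "min_frequency": 2,
        "data_path": "../data/corpus.txt",
    }
})
train_tokenizer(args)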
Example #2
def train_tokenizer(langs, dataset, vocab_size):
    """Train a tokenizer on given list of languages.
    Reserves a special token for each language which is
    [LANG] where LANG is the language tag. These are assigned
    to tokens 5, 6, ..., len(langs) + 4.
    """

    # Byte-pair encoding
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # trainer
    lang_tokens = ['[' + lang + ']' for lang in langs]
    special_tokens = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'] + lang_tokens
    trainer = BpeTrainer(
        special_tokens=special_tokens,
        vocab_size=vocab_size)

    # normalise and pre tokenize
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # create iterator and train
    iterator = _MultilingualIterator(dataset, langs)
    tokenizer.train_from_iterator(iterator, trainer)

    # post process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ], )
    return tokenizer
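A small self-contained check of the id layout described in the docstring: the trainer assigns special tokens the first ids in order, so the five generic tokens take ids 0-4 and the language tokens start at 5 (the toy corpus below is only for illustration):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

langs = ['en', 'de']
specials = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'] + ['[' + lang + ']' for lang in langs]
tok = Tokenizer(BPE(unk_token='[UNK]'))
tok.train_from_iterator(["hello world", "hallo welt"], BpeTrainer(special_tokens=specials))
print(tok.token_to_id('[en]'), tok.token_to_id('[de]'))  # -> 5 6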
Example #3
    def __init__(self,
                 vocab_size=25000,
                 min_freq=5,
                 lang="en",
                 files=[None, None]) -> None:
        """

        Args:
            vocab_size: (int) target vocabulary size
            min_freq: (int) minimum token frequency
            lang: (str) language code, e.g. "en"
            files: (List[str]) ["vocab.json", "merge.txt"]
        """
        super(BPETokenizer, self).__init__()

        self.tokenizer = Tokenizer(BPE(files[0], files[1]))

        self.lang = lang
        self.trainer = BpeTrainer(vocab_size=vocab_size,
                                  min_frequency=min_freq,
                                  special_tokens=["[PAD]", "[SEP]"],
                                  initial_alphabet=ByteLevel.alphabet())

        # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
        # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()
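For reference, a short sketch of how the two files named in the docstring would be loaded back with the current tokenizers API (the file names are just the placeholders from the docstring):

from tokenizers import Tokenizer
from tokenizers.models import BPE

# Reload a previously trained BPE model from its vocab/merges files.
tokenizer = Tokenizer(BPE.from_file("vocab.json", "merge.txt"))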
Example #4
 def bpe_train(self, paths):
     trainer = BpeTrainer(
         vocab_size=50000,
         show_progress=True,
         initial_alphabet=ByteLevel.alphabet(),
         special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
     self.tokenizer.train(paths, trainer)
Example #5
    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[str] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """ Train the model using the given files """

        trainer = BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(trainer, files)
Example #6
 def __init__(
     self,
     load_from: str = None,
     vocab_size: int = 10000,
     max_example_len: int = 128,
     batch_size: int = 16,
     num_stopwords: int = 250,
     mask_output_len: int = 4,
 ):
     self.char_dict: Dict[str, int] = {}
     self.char_rev: Dict[int, str] = {}
     self.token_dict: Dict[str, int] = {}
     self.token_rev: Dict[int, str] = {}
     self.vocab_size = vocab_size
     self.max_example_len = max_example_len
     self.batch_size = batch_size
     self.num_stopwords = num_stopwords
     self.mask_output_len = mask_output_len
     self.tokenizer_fit = False
     self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     self.tokenizer.pre_tokenizer = Whitespace()
     self.tokenizer.normalizer = Sequence(
         [NFD(), Lowercase(), StripAccents()])
     self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"],
                                   vocab_size=self.vocab_size)
     if load_from:
         self._load(load_from)
Example #7
def train_tokenizer(lang, dataset, vocab_size):
    # Byte-pair encoding
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # trainer
    trainer = BpeTrainer(
        special_tokens=['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'],
        vocab_size=vocab_size)

    # pre tokenizer with whitespace
    tokenizer.pre_tokenizer = Whitespace()

    # train
    tokenizer.train_from_iterator(dataset[lang], trainer)

    # post process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    return tokenizer
Example #8
 def prepare_trainer(self):
     return BpeTrainer(vocab_size=30000, show_progress=True, min_frequency=2, special_tokens=[
         "<s>",
         "<pad>",
         "</s>",
         "<unk>",
         "<mask>",
     ])
Example #9
def generate_tokenizer(equations, output, vocab_size):
    from tokenizers import Tokenizer, pre_tokenizers
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    trainer = BpeTrainer(special_tokens=["[PAD]", "[BOS]", "[EOS]"], vocab_size=vocab_size, show_progress=True)
    tokenizer.train(trainer, equations)
    tokenizer.save(path=output, pretty=False)
Example #10
    def load_or_train_tokenizer(file_paths, tokenizer_mode_path):
        '''
        Tries to load a saved text tokenizer.
        If there is none, trains a new tokenizer and saves it.
        '''

        if not os.path.exists(tokenizer_mode_path):
            print('Tokenizer model not found, training one')

            from tokenizers.models import BPE
            from tokenizers import Tokenizer
            from tokenizers.decoders import ByteLevel as ByteLevelDecoder
            from tokenizers.normalizers import NFKC, Sequence
            from tokenizers.pre_tokenizers import ByteLevel
            from tokenizers.trainers import BpeTrainer

            tokenizer = Tokenizer(BPE())
            tokenizer.normalizer = Sequence([
                NFKC()
            ])
            tokenizer.pre_tokenizer = ByteLevel()
            tokenizer.decoder = ByteLevelDecoder()

            trainer = BpeTrainer(
                vocab_size=50000,
                show_progress=True,
                initial_alphabet=ByteLevel.alphabet(),
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>"
                ]
            )
            tokenizer.train(file_paths, trainer)

            if not os.path.exists(tokenizer_mode_path):
                os.makedirs(tokenizer_mode_path)
            tokenizer.model.save(tokenizer_mode_path, None)

        print('Loading trained tokenizer model')

        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_mode_path)
        tokenizer.add_special_tokens({
            'eos_token': '</s>',
            'bos_token': '<s>',
            'unk_token': '<unk>',
            'pad_token': '<pad>',
            'mask_token': '<mask>'
        })

        return tokenizer
Example #11
 def bpe_train(self, paths):
     trainer = BpeTrainer(vocab_size=50000,
                          show_progress=True,
                          initial_alphabet=ByteLevel.alphabet(),
                          special_tokens=[
                              "<s>",
                              "<pad>",
                              "</s>",
                              "<unk>",
                              "<mask>",
                              "<company>",
                              "<label>",
                              "<category>",
                              "<review>",
                          ])
     self.tokenizer.train(trainer, paths)
Example #12
def create_train_bpe_tokenizer(
        bpe_vocab_size,
        asr_text_filepath='asr.txt',
        ttx_text_filepath='ttx.txt',
        save_tokenizer=True,
        tokenizer_filename=".\\data\\tokenizer-test.json"):
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        vocab_size=bpe_vocab_size)
    tokenizer.pre_tokenizer = Whitespace()
    files = [asr_text_filepath, ttx_text_filepath]
    files = [file for file in files if file]  # Get rid of None's
    tokenizer.train(files, trainer)

    if save_tokenizer:
        tokenizer.save(tokenizer_filename)

    return tokenizer
Example #13
    def get_tokenizer_trainer():
        # START init_tokenizer
        from tokenizers import Tokenizer
        from tokenizers.models import BPE

        tokenizer = Tokenizer(BPE())
        # END init_tokenizer
        # START init_trainer
        from tokenizers.trainers import BpeTrainer

        trainer = BpeTrainer(
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
        # END init_trainer
        # START init_pretok
        from tokenizers.pre_tokenizers import Whitespace

        tokenizer.pre_tokenizer = Whitespace()
        # END init_pretok
        return tokenizer, trainer
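The snippet above only builds the tokenizer/trainer pair; a typical follow-up would look like this (the corpus file name is a placeholder):

tokenizer, trainer = get_tokenizer_trainer()
tokenizer.train(["corpus.txt"], trainer)  # train on one or more text files
tokenizer.save("tokenizer.json")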
Example #14
def build_new_vocab():

    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()

    # files = [f"/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-{split}-factoid-7b.json" for split in ["train_split", "dev"]]
    files = "/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-train-factoid-7b.json"

    with open(files) as f:
        file = json.load(f)
    contexts = []
    for question in file['data']:
        for paragraph in question['paragraphs']:
            contexts.append(paragraph['context'])

    tokenizer.train_from_iterator(contexts, trainer)
    additional_vocab = [k for k, v in tokenizer.get_vocab().items()]

    tokenizer.save("tokenizer/tokenizer-bioasq.json")
    return additional_vocab
Example #15
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

path_data = "../../ml-datasets/wmt14/tokenizer/"

path_train_src = "../../ml-datasets/wmt14/train.en"
path_train_tgt = "../../ml-datasets/wmt14/train.de"

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    NFKC(),
    Lowercase()
])

tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=25000, show_progress=True, initial_alphabet=ByteLevel.alphabet(),
                     min_frequency=2, special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>", ])
tokenizer.train(trainer, [path_train_src, path_train_tgt])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(path_data)
Example #16
def tokenizer_pipeline():  # name taken from the call in __main__ below
    tokenizer = Tokenizer(BPE())

    # string normalization
    tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
    tokenizer.pre_tokenizer = ByteLevel()
    tokenizer.decoder = ByteLevelDecoder()
    return tokenizer


if __name__ == "__main__":
    # preparing corpus for wiki
    en_vocab_size = 50257
    wiki_txt = load_text_file_json('text/AA/wiki_00.json', 'text')
    write_text_file(wiki_txt, 'wiki-corpus.txt')

    corpus_files = {
        'wiki-corpus': 'wiki-corpus.txt',
        'oscar-corpus': 'shuff-dedup/ceb/ceb_dedup.txt'
    }

    # define a trainer for the tokenizer
    trainer = BpeTrainer(vocab_size=en_vocab_size,
                         show_progress=True,
                         initial_alphabet=ByteLevel.alphabet(),
                         special_tokens=['<|endoftext|>', '<pad>'])

    for corpus, path in corpus_files.items():
        tokenizer = tokenizer_pipeline()
        tokenizer.train([path], trainer)
        tokenizer.save(f'model/{corpus}-tokenizer.json')
Example #17
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
#from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
#tokenizer.pre_tokenizer = Whitespace()
files = ['./processed/processed_wiki_ko.txt']
tokenizer.train(files, trainer)

tokenizer.save("wiki_tokenizer.json")
Example #18
# WordPiece tokenizer (construction line assumed here to mirror the BPE and
# Unigram blocks below; the original excerpt starts at the pre-tokenizer)
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=60000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'bpe_60k.json'))

# Unigram tokenizer
tokenizer = Tokenizer(Unigram())
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = UnigramTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=50000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'unigram_50k.json'))
Example #19
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

# Our tokenizer also needs a pre-tokenizer responsible for converting the input
# to a ByteLevel representation.
tokenizer.pre_tokenizer = ByteLevel()

# And finally, let's plug a decoder so we can recover from a tokenized input
# to the original one
tokenizer.decoder = ByteLevelDecoder()

from tokenizers.trainers import BpeTrainer

# We initialize our trainer, giving it the details about the vocabulary we want
# to generate
trainer = BpeTrainer(vocab_size=25000,
                     show_progress=True,
                     initial_alphabet=ByteLevel.alphabet())

tokenizer.train(trainer,
                ["/Volumes/750GB-HDD/root/Question-Answering/pyData/big.txt"])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

# Et voilà ! You trained your very first tokenizer from scratch using tokenizers.
# Of course, this covers only the basics, and you may want to have a look at the
# add_special_tokens or special_tokens parameters on the Trainer class, but the
# overall process should be very similar.

# You will see the generated files in the output.
tokenizer.model.save('/Volumes/750GB-HDD/root/Question-Answering/pyData')
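A quick round trip with the tokenizer trained above: because the ByteLevel pre-tokenizer is paired with the ByteLevel decoder, decoding reproduces the input (lowercased by the normalizer):

encoded = tokenizer.encode("The quick brown fox")
print(encoded.tokens)
print(tokenizer.decode(encoded.ids))  # -> "the quick brown fox"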
Example #20
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    train = True
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
        train = False
    else:
        print('%s tokenizer file does not exist; training new tokenizer' %
              tpath_expected)

    if train:

        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        # - init Tokenizer using algo
        # - specify algo specific trainer
        # - specify any pre-processing of text (will affect decoding)
        #   see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        # - different training calls if it's the arxiv dataset or wikitext
        #   see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/

        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.ByteLevel()

        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.WordPiece()  # WordPiece decoder adds back the spaces

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                         dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
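A hedged usage sketch; 'arxiv' is one of the dataset names the function checks for, and VALID_DATASETS / load_dataset come from the surrounding project:

tokenizer, vocab = train_tokenizer_vocab('arxiv', style='BPE', force_retrain=False)
print(tokenizer.get_vocab_size())  # vocab is currently None (see the TODO above)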
Example #21
    parser.add_argument('--languages',
                        help='dataset languages to tokenize',
                        type=str,
                        required=True)
    parser.add_argument('--tokenizer-out',
                        help='tokenizer output file',
                        type=str,
                        required=True)
    parser.add_argument('--special-tokens',
                        type=str,
                        default="[UNK],[SEP],[PAD],[MASK],[ECHO],[TRANSLATE]")
    args = parser.parse_args()

    # translation_dataset = load_dataset(args.dataset, args.languages)
    # translation_dataset.set_format(columns='translation')
    translation_dataset = NewsCommentaryTranslationDataset()

    tokenizer_file = args.tokenizer_out
    special_tokens = args.special_tokens.split(",")

    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(special_tokens=special_tokens)

    all_translation_sentences = map(
        lambda x: [x['translation'][lang] for lang in x['translation'].keys()],
        translation_dataset)

    tokenizer.train_from_iterator(all_translation_sentences, trainer=trainer)

    tokenizer.save(tokenizer_file)
Example #22
print(f"Unique names: {len(names)}\n")

name_words = {n: " ".join(split_to_words(n)) for n in names}

with open(f"{proc_path}/names.txt", "w") as f:
    f.write("\n".join(list(name_words.values())))
    # f.write("\n".join(words))

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    # NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True)
tokenizer.train(trainer, [f"{proc_path}/names.txt"])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(proc_path)

tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json',
                                f'{proc_path}/merges.txt')

with open(f"{proc_path}/vocab.json", "r") as f:
    bpe_vocab = json.load(f)

bpe_vocab_idx = {v: k for k, v in bpe_vocab.items()}

char_map = {k: v + 1 for k, v in bpe_vocab.items() if len(k) == 1}
if __name__ == "__main__":

    args = parser.parse_args()

    for f in ['ewe-fon', "ewe", "fon"]:

        # instantiate tokenizer
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

        # splitting our inputs into words
        tokenizer.pre_tokenizer = Whitespace()

        # instantiate trainer
        trainer = BpeTrainer(
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
            min_frequency=2)

        # get files
        files = [os.path.join(args.data_dir, f"{f}-sentences.txt")]

        # train tokenizer
        tokenizer.train(files=files, trainer=trainer)

        # save tokenizer config file
        tokenizer.save(os.path.join(args.save_dir, f"tokenizer-{f}.json"))

    # load trained tokenizers
    for f in ['ewe-fon', "ewe", "fon"]:
        print(f'Using {f} tokenizer : \n')
        try:
Example #24
import fire
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, Lowercase, Strip


def train(dataset_path,
          output_dir='data/tokenizer/',
          vocab_size=30_000,
          min_frequency=3):

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([Lowercase(), Strip()])

    files = [dataset_path]
    tokenizer.train(trainer, files)

    files = tokenizer.model.save(output_dir)
    tokenizer.model = BPE.from_file(*files, unk_token='[UNK]')

    tokenizer.save(f'{output_dir}/tokenizer.json')


if __name__ == '__main__':
    fire.Fire(train)  # assumed entry point, given the `import fire` above