def test_train_from_iterator(self):
    # Train a WordPiece tokenizer directly from an in-memory iterator of strings
    text = ["A first sentence", "Another sentence", "And a last one"]
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train_from_iterator(text, show_progress=False)

    # Lowercasing is on by default, so the encoding is all lowercase
    output = tokenizer.encode("A sentence")
    assert output.tokens == ["a", "sentence"]
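A standalone version of the same call, as a minimal sketch that assumes only the tokenizers package is installed (the import is not shown in the test excerpt above):

from tokenizers import BertWordPieceTokenizer

# Train a WordPiece tokenizer from an in-memory list of strings
text = ["A first sentence", "Another sentence", "And a last one"]
tokenizer = BertWordPieceTokenizer()
tokenizer.train_from_iterator(text, show_progress=False)

# BertWordPieceTokenizer lowercases by default, so "A sentence" encodes to ['a', 'sentence']
print(tokenizer.encode("A sentence").tokens)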
Example #2
def load_from_dataset_bert_tokenizer(
    dataset_name="wikitext",
    dataset_config_name="wikitext-2-raw-v1",
    vocab_size=30000
):
    """
    Adapted from:
    https://github.com/huggingface/tokenizers/tree/master/bindings/python/examples
    If used frequently, save the trained tokenizer to disk so it does not
    have to be retrained on every run.

    tokenizers >= 0.10.0 is required to train from an iterator, but that
    version is not yet supported by the stable Hugging Face transformers
    or datasets releases.
    """

    from datasets import load_dataset
    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer(
        strip_accents=True,
        # the following arguments match the defaults and are listed for clarity
        clean_text=True,
        handle_chinese_chars=True,
        lowercase=True,
    )

    dataset = load_dataset(dataset_name, dataset_config_name)

    # Build an iterator over this dataset
    def batch_iterator():
        batch_length = 1000
        for i in range(0, len(dataset["train"]), batch_length):
            yield dataset["train"][i : i + batch_length]["text"]

    # Train
    tokenizer.train_from_iterator(
        batch_iterator(),
        length=len(dataset["train"]),
        # the following arguments match the defaults and are listed for clarity
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )

    return tokenizer
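A possible call site for the helper above, shown as a sketch; the prefix "wikitext2-wordpiece" is just an illustrative name, and save_model persists the WordPiece vocabulary to the given directory:

# Hypothetical usage: train once on wikitext-2, then persist the vocabulary
# so later runs can load it instead of re-downloading and retraining.
tokenizer = load_from_dataset_bert_tokenizer(vocab_size=30000)
tokenizer.save_model(".", "wikitext2-wordpiece")  # typically writes ./wikitext2-wordpiece-vocab.txt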
Example #3
from tokenizers import BertWordPieceTokenizer


def train_tokenizer(file_iterator):

    # Initialize an empty tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )

    # And then train
    tokenizer.train_from_iterator(
        file_iterator,
        vocab_size=1000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )

    # Save the vocabulary files (`args` is an argparse.Namespace defined at module level)
    tokenizer.save_model(args.out, args.name)
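The function above relies on a module-level args object from argparse and on a file_iterator over raw text. A minimal sketch of that wiring, with illustrative argument names and paths:

import argparse
import glob

parser = argparse.ArgumentParser()
parser.add_argument("--files", default="data/*.txt", help="glob pattern of training text files")
parser.add_argument("--out", default="./", help="directory for the saved vocab file")
parser.add_argument("--name", default="bert-wordpiece", help="prefix for the saved vocab file")
args = parser.parse_args()

def file_iterator():
    # Yield one line of raw text at a time from every matching file
    for path in glob.glob(args.files):
        with open(path, encoding="utf-8") as f:
            for line in f:
                yield line.rstrip("\n")

train_tokenizer(file_iterator())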