Example #1
from tokenizers import BertWordPieceTokenizer


def train_tokenizer(data_file_paths, vocab_size=30000):  # vocab_size default is illustrative
    # Train a BERT-style WordPiece tokenizer from scratch on the given text files.
    t = BertWordPieceTokenizer()
    t.train(
        files=data_file_paths,
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        limit_alphabet=1000,
    )
    return t
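A hypothetical call site for this helper; the corpus file names and the output directory below are placeholders, not part of the original example:

import os

# Train on two local text files, then write vocab.txt to ./tokenizer_out.
os.makedirs("./tokenizer_out", exist_ok=True)
tokenizer = train_tokenizer(["corpus_part1.txt", "corpus_part2.txt"], vocab_size=30000)
tokenizer.save_model("./tokenizer_out")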
Example #2
    def load_WordPiece(self):
        # Load the previously saved WordPiece vocab (tokenizer-vocab.txt).
        tokenizer = BertWordPieceTokenizer(
            vocab=os.path.join(self.config['tokenizer_path'], 'tokenizer-vocab.txt'),
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,
            lowercase=True,
            wordpieces_prefix='##')
        return tokenizer
Example #3
import pathlib
from typing import List

from tokenizers import BertWordPieceTokenizer
from tokenizers.pre_tokenizers import BertPreTokenizer, PreTokenizer

# MecabPreTokenizer is the project's custom MeCab-based pre-tokenizer (defined elsewhere).


def train_custom_tokenizer(files: List[str], tokenizer_file: str,
                           **kwargs) -> BertWordPieceTokenizer:
    """ Tokenizerの学習・保存処理:custom PreTokenizer付きのTokenizerを学習・保存する。
    """

    tokenizer = BertWordPieceTokenizer(
        handle_chinese_chars=False,  # for japanese
        strip_accents=False,  # for japanese
    )
    tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(
        MecabPreTokenizer())

    # Learn the subword segmentation from the given set of corpus files.
    tokenizer.train(files, **kwargs)

    # Save the tokenizer config as JSON, which holds the preprocessing parameters
    # in addition to the vocab information.
    # NOTE: a custom PreTokenizer written in Python cannot be serialized, so a
    # Rust-based PreTokenizer is swapped in as a dummy before saving.
    # The JSON therefore records the dummy PreTokenizer, and the custom
    # PreTokenizer must be re-attached when the tokenizer is loaded.
    tokenizer._tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.save(tokenizer_file)

    # (Optional) also write the vocab as a plain .txt file, for any external
    # processing that needs it.
    filename = "wordpiece"
    model_files = tokenizer._tokenizer.model.save(
        str(pathlib.Path(tokenizer_file).parent), filename)

    return tokenizer
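Because the saved JSON only records the dummy BertPreTokenizer, the custom pre-tokenizer has to be re-attached after loading. A minimal sketch, assuming the same MecabPreTokenizer class is available at load time:

from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import PreTokenizer


def load_custom_tokenizer(tokenizer_file: str) -> Tokenizer:
    # Restore the serialized tokenizer, then re-inject the Python pre-tokenizer
    # that could not be serialized into the JSON.
    tokenizer = Tokenizer.from_file(tokenizer_file)
    tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())
    return tokenizer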
Example #4
import os
from typing import List

from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer


def train_bert_tokenizer(sentences: List[str],
                         serialize_path: str,
                         vocab_size: int = 6000) -> BertTokenizer:
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=False,
    )
    tokenizer.train_from_iterator(
        sentences,
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=500,
        wordpieces_prefix="##",
    )

    # Save the files--first write out the vocab, then use BertTokenizer's save_pretrained
    tokenizer.save_model(serialize_path)
    bert_tokenizer = BertTokenizer.from_pretrained(serialize_path + os.sep +
                                                   "vocab.txt")
    bert_tokenizer.save_pretrained(serialize_path)
    os.rename(serialize_path + os.sep + "tokenizer_config.json",
              serialize_path + os.sep + "config.json")
    return bert_tokenizer
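A hypothetical way to call train_bert_tokenizer and reload the result; the sentences and the output directory are placeholders:

import os
from transformers import BertTokenizer

# Train on a tiny illustrative corpus and serialize into a local directory.
os.makedirs("./bert_tok", exist_ok=True)
sentences = ["first training sentence", "second training sentence"]
bert_tokenizer = train_bert_tokenizer(sentences, "./bert_tok", vocab_size=6000)

# The serialized directory can be reloaded later with the standard API.
reloaded = BertTokenizer.from_pretrained("./bert_tok")
print(reloaded.tokenize("a quick check"))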
Example #5
    def training_WordPiece(self):
        # Train a WordPiece vocab on the corpus files whose names contain 'mecab'.
        tokenizer = BertWordPieceTokenizer(vocab=None,
                                           clean_text=True,
                                           handle_chinese_chars=True,
                                           strip_accents=True,
                                           lowercase=True,
                                           wordpieces_prefix='##')
        tokenizer.train(
            [
                os.path.join(self.corpus_dir_path, file_path)
                for file_path in os.listdir(self.corpus_dir_path)
                if 'mecab' in file_path
            ],
            limit_alphabet=self.config['limit_alphabet'],
            vocab_size=self.config['vocab_size'],
            special_tokens=self.get_special_tokens())
        print('training WordPiece is finished!')
        tokenizer.save_model(self.config['tokenizer_path'], prefix='tokenizer')
        print('tokenizer is saved in {}'.format(
            os.path.join(self.config['tokenizer_path'], 'tokenizer-vocab.txt')))
Example #6
    def test_encode_formats(self, bert_files):
        with pytest.deprecated_call():
            tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

        # Encode
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
        output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
        output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]

        # Encode batch
        result_single = [
            ["[CLS]", "my", "name", "is", "john", "[SEP]"],
            ["[CLS]", "my", "name", "is", "georges", "[SEP]"],
        ]
        result_pair = [
            ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"],
            ["[CLS]", "my", "name", "is", "georges", "[SEP]", "pair", "[SEP]"],
        ]

        def format(encodings):
            return [e.tokens for e in encodings]

        def test_single(input, is_pretokenized=False):
            output = tokenizer.encode_batch(input, is_pretokenized=is_pretokenized)
            assert format(output) == result_single

        def test_pair(input, is_pretokenized=False):
            output = tokenizer.encode_batch(input, is_pretokenized=is_pretokenized)
            assert format(output) == result_pair

        # Classic inputs

        # Lists
        test_single(["My name is John", "My name is Georges"])
        test_pair([("my name is john", "pair"), ("my name is georges", "pair")])
        test_pair([["my name is john", "pair"], ["my name is georges", "pair"]])

        # Tuples
        test_single(("My name is John", "My name is Georges"))
        test_pair((("My name is John", "pair"), ("My name is Georges", "pair")))

        # Numpy
        test_single(np.array(["My name is John", "My name is Georges"]))
        test_pair(np.array([("My name is John", "pair"), ("My name is Georges", "pair")]))
        test_pair(np.array([["My name is John", "pair"], ["My name is Georges", "pair"]]))

        # PreTokenized inputs

        # Lists
        test_single([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]], True)
        test_pair(
            [
                (["My", "name", "is", "John"], ["pair"]),
                (["My", "name", "is", "Georges"], ["pair"]),
            ],
            True,
        )
        test_pair(
            [
                [["My", "name", "is", "John"], ["pair"]],
                [["My", "name", "is", "Georges"], ["pair"]],
            ],
            True,
        )

        # Tuples
        test_single((("My", "name", "is", "John"), ("My", "name", "is", "Georges")), True)
        test_pair(
            (
                (("My", "name", "is", "John"), ("pair",)),
                (("My", "name", "is", "Georges"), ("pair",)),
            ),
            True,
        )
        test_pair(
            (
                (["My", "name", "is", "John"], ["pair"]),
                (["My", "name", "is", "Georges"], ["pair"]),
            ),
            True,
        )

        # Numpy
        test_single(
            np.array([["My", "name", "is", "John"], ["My", "name", "is", "Georges"]]),
            True,
        )
        test_single(
            np.array((("My", "name", "is", "John"), ("My", "name", "is", "Georges"))),
            True,
        )
        test_pair(
            np.array(
                [
                    [["My", "name", "is", "John"], ["pair"]],
                    [["My", "name", "is", "Georges"], ["pair"]],
                ],
                dtype=object,
            ),
            True,
        )
        test_pair(
            np.array(
                (
                    (("My", "name", "is", "John"), ("pair",)),
                    (("My", "name", "is", "Georges"), ("pair",)),
                ),
                dtype=object,
            ),
            True,
        )

        # Malformed
        with pytest.raises(TypeError, match="TextInputSequence must be str"):
            tokenizer.encode([["my", "name"]])
        with pytest.raises(TypeError, match="TextInputSequence must be str"):
            tokenizer.encode("My name is john", [["pair"]])
        with pytest.raises(TypeError, match="TextInputSequence must be str"):
            tokenizer.encode("my name is john", ["pair"])

        with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
            tokenizer.encode("My name is john", is_pretokenized=True)
        with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
            tokenizer.encode("My name is john", ["pair"], is_pretokenized=True)
        with pytest.raises(TypeError, match="InputSequence must be Union[List[str]"):
            tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)
Example #7
    def test_encode_formats(self, bert_files):
        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

        # Well formed
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == [
            "[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"
        ]
        output = tokenizer.encode(["my", "name", "is", "john"],
                                  is_pretokenized=True)
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
        output = tokenizer.encode(["my", "name", "is", "john"], ["pair"],
                                  is_pretokenized=True)
        assert output.tokens == [
            "[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"
        ]

        output = tokenizer.encode_batch(
            ["My name is John", "My name is Georges"])
        assert output[0].tokens == [
            "[CLS]", "my", "name", "is", "john", "[SEP]"
        ]
        assert output[1].tokens == [
            "[CLS]", "my", "name", "is", "georges", "[SEP]"
        ]
        output = tokenizer.encode_batch([("my name is john", "pair"),
                                         ("my name is john", "pair")])
        assert output[0].tokens == [
            "[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"
        ]
        assert output[1].tokens == [
            "[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"
        ]
        output = tokenizer.encode_batch([["my", "name", "is", "john"]],
                                        is_pretokenized=True)
        assert output[0].tokens == [
            "[CLS]", "my", "name", "is", "john", "[SEP]"
        ]

        # Malformed
        # Each call gets its own pytest.raises block; with several calls inside one
        # block, only the first raising statement would actually be exercised.
        with pytest.raises(ValueError, match="InputSequence must be str"):
            tokenizer.encode([["my", "name"]])
        with pytest.raises(ValueError, match="InputSequence must be str"):
            tokenizer.encode("My name is john", [["pair"]])
        with pytest.raises(ValueError, match="InputSequence must be str"):
            tokenizer.encode("my name is john", ["pair"])

        with pytest.raises(ValueError,
                           match="InputSequence must be Union[List[str]"):
            tokenizer.encode("My name is john", is_pretokenized=True)
        with pytest.raises(ValueError,
                           match="InputSequence must be Union[List[str]"):
            tokenizer.encode("My name is john", ["pair"], is_pretokenized=True)
        with pytest.raises(ValueError,
                           match="InputSequence must be Union[List[str]"):
            tokenizer.encode(["My", "name", "is", "John"],
                             "pair",
                             is_pretokenized=True)
Example #8
from tokenizers.implementations import ByteLevelBPETokenizer, BertWordPieceTokenizer
from tokenizers.processors import BertProcessing
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

tokenizer = BertWordPieceTokenizer(vocab_file="./FaBerto/vocab.txt")

# tokenizer._tokenizer.post_processor = BertProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )
tokenizer.enable_truncation(max_length=512)

# Quick sanity check of the loaded tokenizer on two Persian sentences.
print(tokenizer.encode("مرکزگرایی یکی از عوامل اصلی خداناباوری است.").tokens)
print(tokenizer.encode("این یک تست است.").ids)

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = RobertaTokenizerFast.from_pretrained("./FaBerto", max_len=512)

model = RobertaForMaskedLM(config=config)
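The remaining imports (LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments) are never used in the snippet above. Below is a sketch of how they are commonly wired together for masked-language-model pretraining; the corpus path and the hyperparameters are placeholders, not taken from the original script:

# Build a line-by-line dataset with the fast tokenizer loaded above.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./fa_corpus.txt",  # placeholder path to the training text
    block_size=128,
)
# Dynamic masking for the MLM objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
training_args = TrainingArguments(
    output_dir="./FaBerto",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()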
Example #9
import tokenizers
from transformers import BertTokenizer
import glob
from tokenizers.implementations import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)
files = glob.glob("./corpus_for_tokenization/*.txt")

tokenizer.train(files,
                vocab_size=50000,
                min_frequency=3,
                show_progress=True,
                special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
                limit_alphabet=15000,
                wordpieces_prefix="##")
tokenizer.save_model("./")
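The saved vocab can then be reloaded with the BertTokenizer imported above. A short sketch, assuming save_model("./") wrote the library's default ./vocab.txt:

# Load the freshly trained vocab into a transformers BertTokenizer
# (do_lower_case matches lowercase=True used during training).
bert_tokenizer = BertTokenizer.from_pretrained("./vocab.txt", do_lower_case=True)
print(bert_tokenizer.tokenize("This is a quick sanity check."))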