Example #1
from tokenizers import BertWordPieceTokenizer
from transformers import BertConfig, BertTokenizerFast


def get_tokenizer(vocab_file):
    # Wrap an existing WordPiece vocab file in a fast BERT tokenizer.
    tokenizer = BertTokenizerFast(
        vocab_file=vocab_file,
        do_basic_tokenize=True
    )

    special_tokens_dict = {'additional_special_tokens': ["<end>", "<begin>"]}
    tokenizer.add_special_tokens(special_tokens_dict)
    return tokenizer
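
As a quick usage sketch: the returned tokenizer keeps the two added markers as atomic tokens (the vocab path is the one used later in this example; the sample sentence is illustrative):

tokenizer = get_tokenizer("/opt/ml/code/KBOBERT/vocab.txt")
print(tokenizer.tokenize("<begin> example text <end>"))  # '<begin>' and '<end>' stay unsplit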

# NOTE: this example arrives truncated, starting mid-argument list. The
# function header and the BertWordPieceTokenizer setup below are a
# reconstruction (the function name and constructor arguments are assumed).
def train_and_build_tokenizer():
    # Train a WordPiece vocab from scratch on the raw corpus.
    wp_tokenizer = BertWordPieceTokenizer(lowercase=False)  # assumed setup
    wp_tokenizer.train(
        files='/opt/ml/code/KBOBERT/KBOBERT_Data.txt',
        vocab_size=32000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        wordpieces_prefix="##")

    wp_tokenizer.save_model('./')

    # Reload the freshly trained vocab as a fast tokenizer for model training.
    tokenizer = BertTokenizerFast(
        vocab_file="/opt/ml/code/KBOBERT/vocab.txt",
        max_len=512,
        do_lower_case=False,
    )

    tokenizer.add_special_tokens({'mask_token': '[MASK]'})

    # https://huggingface.co/transformers/model_doc/bert.html#bertconfig

    config = BertConfig(vocab_size=32000,
                        hidden_size=256,
                        num_hidden_layers=6,
                        num_attention_heads=4,
                        intermediate_size=3072,
                        hidden_act="gelu",
                        hidden_dropout_prob=0.1,
                        attention_probs_dropout_prob=0.1,
                        max_position_embeddings=512,
                        type_vocab_size=2,
                        pad_token_id=0,
                        position_embedding_type="absolute")

    return tokenizer, config  # assumed return; the original example is cut off here
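
The truncated example never shows the config being consumed; a common next step in a from-scratch pretraining script is to build a masked-LM model from it. A minimal sketch (using BertForMaskedLM and resize_token_embeddings here is an assumption, though both are standard transformers APIs):

from transformers import BertForMaskedLM

tokenizer, config = train_and_build_tokenizer()
model = BertForMaskedLM(config)
# Make the embedding matrix match the tokenizer after any added special tokens.
model.resize_token_embeddings(len(tokenizer))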