Beispiel #1
0
print("Single segment token (int): {}".format(single_seg_input['input_ids']))
print("Single segment type       : {}".format(
    single_seg_input['token_type_ids']))

# Segments are concatenated into a single input to the model; the
# per-token segment types are printed below.
# Blank line to separate this report from the previous output.
print()

# Look the ids up once, then show them decoded to token strings and raw.
multi_ids = multi_seg_input['input_ids']
multi_tokens = tokenizer.convert_ids_to_tokens(multi_ids)
print("Multi segment token (str): {}".format(multi_tokens))
print("Multi segment token (int): {}".format(multi_ids))

# Per-token segment types for the multi-segment input.
multi_types = multi_seg_input['token_type_ids']
print("Multi segment type       : {}".format(multi_types))

# Padding highlight: encode two sentences of different lengths as one batch.
# With padding requested, the shorter first sentence gets PADDED tokens so
# that it matches the length of the second sequence.
# NOTE(review): recent transformers releases deprecate `pad_to_max_length`
# in favour of `padding=True` -- confirm against the installed version
# before switching; the original keyword is kept here for compatibility.
tokens = tokenizer.batch_encode_plus(
    ["This is a sample", "This is another longer sample text"],
    pad_to_max_length=True,
)

# Walk the ids and attention masks in lockstep instead of indexing with a
# hard-coded batch size, so the loop follows whatever the batch contains.
for input_ids, attention_mask in zip(tokens['input_ids'],
                                     tokens['attention_mask']):
    print("Tokens (int)      : {}".format(input_ids))
    # convert_ids_to_tokens accepts a whole list of ids, so one call yields
    # the same list of token strings as converting each id separately.
    print("Tokens (str)      : {}".format(
        tokenizer.convert_ids_to_tokens(input_ids)))
    print("Tokens (attn_mask): {}".format(attention_mask))
    print()

from transformers import TFBertModel, BertModel

# Load one pretrained BERT checkpoint twice: first through the TensorFlow
# model class, then through the PyTorch model class.
bert_checkpoint = 'bert-base-cased'
model_tf = TFBertModel.from_pretrained(bert_checkpoint)
model_pt = BertModel.from_pretrained(bert_checkpoint)