Beispiel #1
0
# We need to create the model and tokenizer
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokens comes from a process that splits the input into sub-entities with interesting linguistic properties.
tokens = tokenizer.tokenize("This is an input example")
print("Tokens: {}".format(tokens))

# This is not sufficient for the model, as it requires integers as input,
# not a problem, let's convert tokens to ids.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# Add the required special tokens
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# We need to convert to a Deep Learning framework specific format, let's use PyTorch for now.
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt))

# Now we're ready to go through BERT with out input
outputs, pooled = model(tokens_pt)
print("Token wise output: {}, Pooled output: {}".format(
    outputs.shape, pooled.shape))

# Same thing factored into one-line as follow
tokens_pt2 = tokenizer.encode_plus("This is an input example",
                                   return_tensors="pt")

for key, value in tokens_pt2.items():