def test_inference_large_model(self): model = LukeModel.from_pretrained("studio-ousia/luke-large").eval() model.to(torch_device) tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large", task="entity_classification") text = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ." span = (39, 42) encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt") # move all values to device for key, value in encoding.items(): encoding[key] = encoding[key].to(torch_device) outputs = model(**encoding) # Verify word hidden states expected_shape = torch.Size((1, 42, 1024)) self.assertEqual(outputs.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( [[0.0133, 0.0865, 0.0095], [0.3093, -0.2576, -0.7418], [-0.1720, -0.2117, -0.2869]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) # Verify entity hidden states expected_shape = torch.Size((1, 1, 1024)) self.assertEqual(outputs.entity_last_hidden_state.shape, expected_shape) expected_slice = torch.tensor([[0.0466, -0.0106, -0.0179]]).to(torch_device) self.assertTrue(torch.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
def create_and_check_model( self, config, input_ids, attention_mask, token_type_ids, entity_ids, entity_attention_mask, entity_token_type_ids, entity_position_ids, sequence_labels, token_labels, choice_labels, entity_labels, entity_classification_labels, entity_pair_classification_labels, entity_span_classification_labels, ): model = LukeModel(config=config) model.to(torch_device) model.eval() # test with words + entities result = model( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, entity_ids=entity_ids, entity_attention_mask=entity_attention_mask, entity_token_type_ids=entity_token_type_ids, entity_position_ids=entity_position_ids, ) self.parent.assertEqual( result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) self.parent.assertEqual( result.entity_last_hidden_state.shape, (self.batch_size, self.entity_length, self.hidden_size)) # test with words only result = model(input_ids, token_type_ids=token_type_ids) result = model(input_ids) self.parent.assertEqual( result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def test_model_from_pretrained(self): for model_name in LUKE_PRETRAINED_MODEL_ARCHIVE_LIST: model = LukeModel.from_pretrained(model_name) self.assertIsNotNone(model)
def convert_luke_checkpoint(checkpoint_path, metadata_path, entity_vocab_path, pytorch_dump_folder_path, model_size): # Load configuration defined in the metadata file with open(metadata_path) as metadata_file: metadata = json.load(metadata_file) config = LukeConfig(use_entity_aware_attention=True, **metadata["model_config"]) # Load in the weights from the checkpoint_path state_dict = torch.load(checkpoint_path, map_location="cpu") # Load the entity vocab file entity_vocab = load_entity_vocab(entity_vocab_path) tokenizer = RobertaTokenizer.from_pretrained( metadata["model_config"]["bert_model_name"]) # Add special tokens to the token vocabulary for downstream tasks entity_token_1 = AddedToken("<ent>", lstrip=False, rstrip=False) entity_token_2 = AddedToken("<ent2>", lstrip=False, rstrip=False) tokenizer.add_special_tokens( dict(additional_special_tokens=[entity_token_1, entity_token_2])) config.vocab_size += 2 print(f"Saving tokenizer to {pytorch_dump_folder_path}") tokenizer.save_pretrained(pytorch_dump_folder_path) with open( os.path.join(pytorch_dump_folder_path, LukeTokenizer.vocab_files_names["entity_vocab_file"]), "w") as f: json.dump(entity_vocab, f) tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path) # Initialize the embeddings of the special tokens word_emb = state_dict["embeddings.word_embeddings.weight"] ent_emb = word_emb[tokenizer.convert_tokens_to_ids(["@"])[0]].unsqueeze(0) ent2_emb = word_emb[tokenizer.convert_tokens_to_ids(["#"])[0]].unsqueeze(0) state_dict["embeddings.word_embeddings.weight"] = torch.cat( [word_emb, ent_emb, ent2_emb]) # Initialize the query layers of the entity-aware self-attention mechanism for layer_index in range(config.num_hidden_layers): for matrix_name in ["query.weight", "query.bias"]: prefix = f"encoder.layer.{layer_index}.attention.self." state_dict[prefix + "w2e_" + matrix_name] = state_dict[prefix + matrix_name] state_dict[prefix + "e2w_" + matrix_name] = state_dict[prefix + matrix_name] state_dict[prefix + "e2e_" + matrix_name] = state_dict[prefix + matrix_name] # Initialize the embedding of the [MASK2] entity using that of the [MASK] entity for downstream tasks entity_emb = state_dict["entity_embeddings.entity_embeddings.weight"] entity_emb[entity_vocab["[MASK2]"]] = entity_emb[entity_vocab["[MASK]"]] model = LukeModel(config=config).eval() missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) if not (len(missing_keys) == 1 and missing_keys[0] == "embeddings.position_ids"): raise ValueError( f"Missing keys {', '.join(missing_keys)}. Expected only missing embeddings.position_ids" ) if not (all( key.startswith("entity_predictions") or key.startswith("lm_head") for key in unexpected_keys)): raise ValueError( f"Unexpected keys {', '.join([key for key in unexpected_keys if not (key.startswith('entity_predictions') or key.startswith('lm_head'))])}" ) # Check outputs tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path, task="entity_classification") text = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ." span = (39, 42) encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt") outputs = model(**encoding) # Verify word hidden states if model_size == "large": expected_shape = torch.Size((1, 42, 1024)) expected_slice = torch.tensor([[0.0133, 0.0865, 0.0095], [0.3093, -0.2576, -0.7418], [-0.1720, -0.2117, -0.2869]]) else: # base expected_shape = torch.Size((1, 42, 768)) expected_slice = torch.tensor([[0.0037, 0.1368, -0.0091], [0.1099, 0.3329, -0.1095], [0.0765, 0.5335, 0.1179]]) if not (outputs.last_hidden_state.shape == expected_shape): raise ValueError( f"Outputs.last_hidden_state.shape is {outputs.last_hidden_state.shape}, Expected shape is {expected_shape}" ) if not torch.allclose( outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4): raise ValueError # Verify entity hidden states if model_size == "large": expected_shape = torch.Size((1, 1, 1024)) expected_slice = torch.tensor([[0.0466, -0.0106, -0.0179]]) else: # base expected_shape = torch.Size((1, 1, 768)) expected_slice = torch.tensor([[0.1457, 0.1044, 0.0174]]) if not (outputs.entity_last_hidden_state.shape != expected_shape): raise ValueError( f"Outputs.entity_last_hidden_state.shape is {outputs.entity_last_hidden_state.shape}, Expected shape is {expected_shape}" ) if not torch.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4): raise ValueError # Finally, save our PyTorch model and tokenizer print("Saving PyTorch model to {}".format(pytorch_dump_folder_path)) model.save_pretrained(pytorch_dump_folder_path)
# @author Loreto Parisi (loretoparisi at gmail dot com) # Copyright (c) 2021 Loreto Parisi (loretoparisi at gmail dot com) import os from transformers import LukeTokenizer, LukeModel, LukeForEntityPairClassification, LukeForEntitySpanClassification ### # LUKE models in the following examples will be saved to cache_dir=../../models # # studio-ousia/luke-base # studio-ousia/luke-large-finetuned-tacred # studio-ousia/luke-large-finetuned-conll-2003 ### model = LukeModel.from_pretrained("studio-ousia/luke-base", cache_dir=os.getenv("cache_dir", "../../models")) tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", cache_dir=os.getenv( "cache_dir", "../../models")) # Example 1: Computing the contextualized entity representation corresponding to the entity mention "Beyoncé" text = "Beyoncé lives in Los Angeles." entity_spans = [(0, 7) ] # character-based entity span corresponding to "Beyoncé" inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt") outputs = model(**inputs) word_last_hidden_state = outputs.last_hidden_state