def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
):
    """
    Evaluate a Hugging Face LUKE entity-classification checkpoint with an allennlp model.

    Expected results for ``test.json`` from the Open Entity dataset:
    {'micro_precision': 0.7997806072235107, 'micro_recall': 0.7657563090324402,
    'micro_fscore': 0.7823987007141113}.

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model because sometimes
        the tokenizer of downstream task is not compatible with allennlp.
    batch_size : int
        Batch size handed to the data loader.
    cuda_device : int
        A negative value selects the CPU; otherwise ``cuda:{cuda_device}`` is used.
    result_save_path : str
        When not ``None``, the metrics dictionary is written to this path as JSON.
    """
    import_module_and_submodules("examples_allennlp")

    # The same tokenizer kwargs are shared by the tokenizer and the indexer so
    # that both know about the additional entity special token.
    tokenizer_kwargs = {"additional_special_tokens": [ENT]}
    reader = EntityTypingReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=True,
            tokenizer_kwargs=tokenizer_kwargs,
        ),
        token_indexers={
            "tokens": PretrainedTransformerIndexer(
                model_name=checkpoint_tokenizer_name,
                tokenizer_kwargs=tokenizer_kwargs,
            )
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(checkpoint_model_name)
    transformers_model = LukeForEntityClassification.from_pretrained(checkpoint_model_name)

    # Build the vocabulary from the checkpoint: tokens from its tokenizer and
    # labels from its config, added in index order.
    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # read model
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name},
    )
    model = Model.from_params(params, vocab=vocab)
    # Replace the freshly built classifier head with the one from the checkpoint.
    model.classifier = transformers_model.classifier
    model.eval()

    # set the GPU device to use
    device = torch.device("cpu") if cuda_device < 0 else torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader, data_path, batch_size=batch_size, shuffle=False)
    loader.index_with(model.vocab)

    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            # The forward output is not needed here; the metrics are read back
            # from the model via ``get_metrics`` once the loop has finished.
            model(**batch)

    metrics = model.get_metrics(reset=True)
    print(metrics)

    if result_save_path is not None:
        # ``json.dump`` escapes non-ASCII by default, so pinning UTF-8 only makes
        # the file encoding independent of the platform locale.
        with open(result_save_path, "w", encoding="utf-8") as f:
            json.dump(metrics, f)
from typing import Tuple import torch from allennlp.data import Vocabulary, Instance, Token from allennlp.data.fields import TextField from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer from allennlp.nn.util import get_token_ids_from_text_field_tensors from transformers import BertTokenizer, DataCollatorForWholeWordMask tokenizer = BertTokenizer.from_pretrained('./bert_out') vocab = Vocabulary(non_padded_namespaces=["tokens"]) vocab.add_transformer_vocab(tokenizer, "tokens") vocab.get_token_index("[PAD]", "tokens") idx = PretrainedTransformerMismatchedIndexer("./bert_out", namespace="tokens") def prepare_instance(s): tokens = [Token(t) for t in s.split(" ")] indexed = idx.tokens_to_indices(tokens, vocab) print([vocab.get_token_from_index(i) for i in indexed['token_ids']]) return Instance({"tokens": TextField(tokens, {"tokens": idx})}) instances = [prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ"), prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ")] for i in instances: i["tokens"].index(vocab) tensors = [i.as_tensor_dict() for i in instances] collator = DataCollatorForWholeWordMask(tokenizer=tokenizer) ids = torch.cat([tensors[0]['tokens']['tokens']['token_ids'].unsqueeze(0),
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
    prediction_save_path: str,
):
    """
    Evaluate a Hugging Face LUKE entity-span-classification checkpoint with an allennlp model.

    Expected results for CoNLL-2003 NER English test set.
    {'f1': 0.9461946902654867,
     'precision': 0.945859872611465,
     'recall': 0.9465297450424929}

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model because sometimes
        the tokenizer of downstream task is not compatible with allennlp.
    batch_size : int
        Batch size handed to the data loader.
    cuda_device : int
        A negative value selects the CPU; otherwise ``cuda:{cuda_device}`` is used.
    result_save_path : str
        When not ``None``, the metrics dictionary is written to this path as JSON.
    prediction_save_path : str
        When not ``None``, stored in the model params under the key
        ``"prediction_save_path"`` before the model is constructed.
    """
    import_module_and_submodules("examples_allennlp")

    reader = ConllSpanReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=False,
            tokenizer_kwargs={"add_prefix_space": True},
        ),
        token_indexers={
            "tokens": PretrainedTransformerIndexer(model_name=checkpoint_tokenizer_name)
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(checkpoint_model_name)
    transformers_model = LukeForEntitySpanClassification.from_pretrained(checkpoint_model_name)

    # Build the vocabulary from the checkpoint: tokens from its tokenizer and
    # labels from its config, added in index order.
    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    # The checkpoint's "NIL" label is registered as "O" in the allennlp vocabulary.
    labels = ["O" if label == "NIL" else label for label in labels]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # read model
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name},
    )
    if prediction_save_path is not None:
        params["prediction_save_path"] = prediction_save_path
    model = Model.from_params(params, vocab=vocab)
    # Replace the freshly built classifier head with the one from the checkpoint.
    model.classifier = transformers_model.classifier
    model.eval()

    # set the GPU device to use
    device = torch.device("cpu") if cuda_device < 0 else torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader, data_path, batch_size=batch_size, shuffle=False)
    loader.index_with(model.vocab)

    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            # The forward output is not needed here; the metrics are read back
            # from the model via ``get_metrics`` once the loop has finished.
            model(**batch)

    metrics = model.get_metrics(reset=True)
    print(metrics)

    if result_save_path is not None:
        # ``json.dump`` escapes non-ASCII by default, so pinning UTF-8 only makes
        # the file encoding independent of the platform locale.
        with open(result_save_path, "w", encoding="utf-8") as f:
            json.dump(metrics, f)