def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             output_file: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))

        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                id2label = model.vocab.get_index_to_token_vocabulary("labels")
                _persist_data(file_handle, batch.get("metadata"), model_output, id2label=id2label)
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics()
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            model(**batch)
            metrics = model.get_metrics()

            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             output_file: str = None,
             eval_type: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))

        for batch in generator_tqdm:
            # Move the batch to the GPU if one was requested.
            batch = move_to_device(batch, cuda_device)
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                _persist_data(file_handle, batch.get("metadata"), model_output, eval_type)
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics(reset=True)
def evaluate(model: Model,
             instances: Iterable[Instance],
             task_name: str,
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    """
    Evaluate a model for a particular task (usually after training).

    Parameters
    ----------
    model : ``allennlp.models.model.Model``, required
        The model to evaluate.
    instances : ``Iterable[Instance]``, required
        The (usually test) dataset on which to evaluate the model.
    task_name : ``str``, required
        The name of the task on which to evaluate the model.
    data_iterator : ``DataIterator``
        Iterator that goes through the dataset.
    cuda_device : ``int``
        Cuda device to use.

    Returns
    -------
    metrics : ``Dict[str, Any]``
        A dictionary containing the metrics on the evaluated dataset.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        eval_loss = 0
        nb_batches = 0
        for tensor_batch in generator_tqdm:
            nb_batches += 1

            train_stages = ["stm", "sd", "valid"]
            task_index = TASKS_NAME.index(task_name)
            tensor_batch['task_index'] = torch.tensor(task_index)
            tensor_batch["reverse"] = torch.tensor(False)
            tensor_batch['for_training'] = torch.tensor(False)
            train_stage = train_stages.index("stm")
            tensor_batch['train_stage'] = torch.tensor(train_stage)
            tensor_batch = move_to_device(tensor_batch, cuda_device)

            eval_output_dict = model.forward(**tensor_batch)
            loss = eval_output_dict["loss"]
            eval_loss += loss.item()
            metrics = model.get_metrics(task_name=task_name)
            metrics["stm_loss"] = float(eval_loss / nb_batches)

            description = training_util.description_from_metrics(metrics)
            generator_tqdm.set_description(description, refresh=False)

        metrics = model.get_metrics(task_name=task_name, reset=True)
        metrics["stm_loss"] = float(eval_loss / nb_batches)

    return metrics
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             label_fname: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        label_file = open(label_fname, 'w')
        label_file.write('real_label,guessed_label\n')

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        total_num_inst = 0
        for batch in generator_tqdm:
            num_inst = batch['tokens']['tokens'].size(0)
            total_num_inst += num_inst
            batch = util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)

            if cuda_device == -1:
                output_matrix = output_dict['label_logits'].data.numpy()
            else:
                output_matrix = output_dict['label_logits'].data.cpu().numpy()
            output_labels = np.argmax(output_matrix, axis=1)

            if cuda_device == -1:
                true_labels = batch['label'].data.numpy()
            else:
                true_labels = batch['label'].data.cpu().numpy()

            assert true_labels.shape[0] == output_labels.shape[0]
            for i in range(true_labels.shape[0]):
                label_file.write(str(int(true_labels[i])) + ',')
                label_file.write(str(int(output_labels[i])) + '\n')

            metrics = model.get_metrics()
            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        print("NUM INSTANCES ITERATED OVER: " + str(total_num_inst))
        label_file.close()
        return model.get_metrics(reset=True)
def evaluate(model: Model,
             instances: Iterable[Instance],
             task_name: str,
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    """
    Evaluate a model for a particular task (usually after training).

    Parameters
    ----------
    model : ``allennlp.models.model.Model``, required
        The model to evaluate.
    instances : ``Iterable[Instance]``, required
        The (usually test) dataset on which to evaluate the model.
    task_name : ``str``, required
        The name of the task on which to evaluate the model.
    data_iterator : ``DataIterator``
        Iterator that goes through the dataset.
    cuda_device : ``int``
        Cuda device to use.

    Returns
    -------
    metrics : ``Dict[str, Any]``
        A dictionary containing the metrics on the evaluated dataset.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        eval_loss = 0
        nb_batches = 0
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            nb_batches += 1

            eval_output_dict = model.forward(task_name=task_name, tensor_batch=batch)
            loss = eval_output_dict["loss"]
            eval_loss += loss.item()
            metrics = model.get_metrics(task_name=task_name)
            metrics["loss"] = float(eval_loss / nb_batches)

            description = ", ".join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        metrics = model.get_metrics(task_name=task_name, reset=True, full=True)
        metrics["loss"] = float(eval_loss / nb_batches)

    return metrics
def evaluate(model: Model,
             dataset: Dataset,
             iterator: DataIterator,
             cuda_device: int) -> Tuple[Dict[str, Any], pd.DataFrame, pd.DataFrame]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    output = pd.DataFrame()

    INVERSE_LABEL_MAP = {
        0: "entailment",
        1: "neutral",
        2: "contradiction",
        3: "hidden",
    }

    for raw_batch, batch in generator_tqdm:
        raw_fields = [x.fields for x in raw_batch.instances]
        parsed_fields = []
        for item in raw_fields:
            premise = " ".join([x.text for x in item['premise'].tokens])
            hypothesis = " ".join([x.text for x in item['hypothesis'].tokens])
            label = item['label'].label
            parsed_fields.append({
                "sentence1": premise,
                "sentence2": hypothesis,
                "gold_label": label,
            })
        parsed_fields = pd.DataFrame(parsed_fields)

        tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False)
        bo = model.forward(**tensor_batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value)
                                 for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)

        batch_output = pd.DataFrame()
        batch_output['prediction_label'] = bo['label_logits'].data.numpy().argmax(axis=1)
        batch_output['prediction_score'] = bo['label_probs'].data.numpy().max(axis=1)
        batch_output['prediction_label'] = batch_output.prediction_label.apply(
            lambda x: INVERSE_LABEL_MAP[x])

        parsed_output = pd.concat([parsed_fields, batch_output], axis=1)
        output = pd.concat([output, parsed_output], axis=0)

    hard_subset = output.loc[output.gold_label != output.prediction_label]
    easy_subset = output.loc[output.gold_label == output.prediction_label]

    return model.get_metrics(), hard_subset, easy_subset
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        batch_count = 0
        loss_count = 0
        total_loss = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            loss = model(**batch).get("loss")
            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                metrics["loss"] = loss.item()
                total_loss += loss.item()

            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes "
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / batch_count

        return final_metrics
def get_metrics(
    model: Model,
    total_loss: float,
    total_reg_loss: Optional[float],
    batch_loss: Optional[float],
    batch_reg_loss: Optional[float],
    num_batches: int,
    reset: bool = False,
    world_size: int = 1,
    cuda_device: Union[int, torch.device] = torch.device("cpu"),
) -> Dict[str, float]:
    """
    Gets the metrics but sets `"loss"` to
    the total loss divided by the `num_batches` so that
    the `"loss"` metric is "average loss per batch".
    Returns the `"batch_loss"` separately.
    """
    metrics = model.get_metrics(reset=reset)
    if batch_loss is not None:
        metrics["batch_loss"] = batch_loss
    metrics["loss"] = float(total_loss / num_batches) if num_batches > 0 else 0.0
    if total_reg_loss is not None:
        if batch_reg_loss is not None:
            metrics["batch_reg_loss"] = batch_reg_loss
        metrics["reg_loss"] = float(total_reg_loss / num_batches) if num_batches > 0 else 0.0
    return metrics
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value)
                                 for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics(reset=True)
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value)
                                 for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics()
def get_metrics(
    model: Model,
    total_loss: float,
    num_batches: int,
    reset: bool = False,
    world_size: int = 1,
    cuda_device: Union[int, List] = 0,
) -> Dict[str, float]:
    """
    Gets the metrics but sets `"loss"` to
    the total loss divided by the `num_batches` so that
    the `"loss"` metric is "average loss per batch".
    """
    metrics = model.get_metrics(reset=reset)
    metrics["loss"] = float(total_loss / num_batches) if num_batches > 0 else 0.0

    if world_size > 1:
        # In distributed mode, average out all metrics across GPUs
        aggregated_metrics = {}
        for metric_name, metric_val in metrics.items():
            if isinstance(cuda_device, list):
                metric_tensor = torch.tensor(metric_val).to(torch.device(cuda_device[0]))
            else:
                metric_tensor = torch.tensor(metric_val).to(torch.device(cuda_device))
            dist.all_reduce(metric_tensor, op=dist.ReduceOp.SUM)
            reduced_metric = metric_tensor.item() / world_size
            aggregated_metrics[metric_name] = reduced_metric
        return aggregated_metrics
    else:
        return metrics
def evaluate(model: Model,
             dataset: Dataset,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False)
        model.forward(**tensor_batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value)
                                 for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)

    return model.get_metrics()
def get_metrics(model: Model, total_loss: float, num_batches: int, reset: bool = False) -> Dict[str, float]:
    """
    Gets the metrics but sets ``"loss"`` to
    the total loss divided by the ``num_batches`` so that
    the ``"loss"`` metric is "average loss per batch".
    """
    metrics = model.get_metrics(reset=reset)
    metrics["loss"] = float(total_loss / num_batches) if num_batches > 0 else 0.0
    return metrics
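# A minimal usage sketch for the ``get_metrics`` helper above. The names
# ``my_model`` and ``validation_batches`` are hypothetical stand-ins, not part of
# the original code; the helper itself only needs the accumulated loss and the
# number of batches seen so far.
def _example_get_metrics_usage(my_model: Model, validation_batches) -> Dict[str, float]:
    total_loss = 0.0
    num_batches = 0
    for batch in validation_batches:
        # Each batch is assumed to be a dict of tensors already on the right device.
        output_dict = my_model(**batch)
        total_loss += output_dict["loss"].item()
        num_batches += 1
    # "loss" in the returned dict is the average loss per batch.
    return get_metrics(my_model, total_loss, num_batches, reset=True)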
def evaluate(model: Model,
             dataset: InstanceCollection,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value)
                                 for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)

    return model.get_metrics()
def evaluate(model: Model,
             dataset: Dataset,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    for batch in tqdm.tqdm(generator):
        tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False)
        model.forward(**tensor_batch)

    return model.get_metrics()
def get_model_predictions(model: Model,
                          instances: Iterable[Instance],
                          data_iterator: DataIterator,
                          cuda_device: int) -> Tuple[Dict[str, Any], List]:
    model.eval()

    model_predictions = []

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    return model.get_metrics(), model_predictions
def get_metrics(
    model: Model,
    total_loss: float,
    total_reg_loss: Optional[float],
    batch_loss: Optional[float],
    batch_reg_loss: Optional[float],
    num_batches: int,
    reset: bool = False,
    world_size: int = 1,
    cuda_device: Union[int, torch.device] = torch.device("cpu"),
) -> Dict[str, float]:
    """
    Gets the metrics but sets `"loss"` to
    the total loss divided by the `num_batches` so that
    the `"loss"` metric is "average loss per batch".
    Returns the `"batch_loss"` separately.
    """
    metrics = model.get_metrics(reset=reset)

    if batch_loss is not None:
        metrics["batch_loss"] = batch_loss
    metrics["loss"] = float(total_loss / num_batches) if num_batches > 0 else 0.0
    if total_reg_loss is not None:
        if batch_reg_loss is not None:
            metrics["batch_reg_loss"] = batch_reg_loss
        metrics["reg_loss"] = float(total_reg_loss / num_batches) if num_batches > 0 else 0.0

    if world_size > 1:
        # In distributed mode, average out all metrics across GPUs
        aggregated_metrics = {}
        for metric_name, metric_val in metrics.items():
            metric_tensor = torch.tensor(metric_val).to(cuda_device)
            dist.all_reduce(metric_tensor, op=dist.ReduceOp.SUM)
            reduced_metric = metric_tensor.item() / world_size
            aggregated_metrics[metric_name] = reduced_metric
        return aggregated_metrics
    else:
        return metrics
def evaluate(
    model: Model,
    data_loader: DataLoader,
    cuda_device: int,
    batch_weight_key: str,
) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=len(data_loader))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                metric_name.startswith("_") for metric_name in metrics
            ):
                logger.warning(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar."
                )
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ", ".join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            # ##### Uncomment this block to save class_probabilities, logits, and losses for each batch #####
            # import copy
            # import pickle
            # print(output_dict['class_probabilities'].shape)
            # newoutput_dict = copy.deepcopy(output_dict)
            # newoutput_dict['class_probabilities'] = newoutput_dict['class_probabilities'].cpu().data.numpy()
            # newoutput_dict['logits'] = newoutput_dict['logits'].cpu().data.numpy()
            # newoutput_dict['loss'] = newoutput_dict['loss'].cpu().data.numpy()
            # output_file = os.path.join(os.path.dirname(__file__), '..', "data", "test",
            #                            str(batch_count) + "_output.pkl")
            # if output_file:
            #     with open(output_file, "wb") as file:
            #         pickle.dump(newoutput_dict, file)
            # #################################################################################################

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not HasBeenWarned.tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.4f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes "
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
def evaluate(model: Model,
             dataset: Dataset,
             iterator: BasicIterator,
             cuda_device: int,
             serialization_directory: str) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1, cuda_device=cuda_device,
                         shuffle=False, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.5f" % (name, value) for name, value
                                 in metrics.items() if "overall" in name]) + " ||"
        generator_tqdm.set_description(description)

    metrics = model.get_metrics()
    golds = metrics["gold_spans"]
    predictions = metrics["predicted_spans"]
    assert len(dataset.instances) == len(golds) == len(predictions)

    # gold_file_path = os.path.join(serialization_directory, "gold.txt")
    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    prediction_file = open(prediction_file_path, "w+")
    # gold_file = open(gold_file_path, "w+")
    logger.info("Writing predictions in CoNLL-like format to %s", prediction_file_path)

    for instance, gold, prediction in tqdm.tqdm(zip(dataset.instances, golds, predictions)):
        fields = instance.fields
        if "targets" in fields:
            verb_index = fields["targets"].labels.index(1)
        elif "verb_indicator" in fields:
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None
        else:
            verb_index = None

        frame = None
        if "frame" in fields:
            frame = fields["frame"].tokens[0].text
        gf = None
        if "gf" in fields:
            gf = [g.text for g in fields["gf"].tokens]
        pt = None
        if "pt" in fields:
            pt = [p.text for p in fields["pt"].tokens]

        sentence = [token.text for token in fields["tokens"].tokens]
        gold_tags = convert_spans_to_seq(gold, len(sentence))
        predicted_tags = convert_spans_to_seq(prediction, len(sentence))
        assert len(sentence) == len(gold_tags) == len(predicted_tags)

        write_to_conll_eval_file(prediction_file,
                                 # gold_file,
                                 verb_index,
                                 sentence,
                                 predicted_tags,
                                 gold_tags,
                                 frame,
                                 gf,
                                 pt)

    return model.get_metrics()
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0
        # ksk
        total_probs, all_example_ids = [], []

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            # ksk
            if 'probs' in output_dict:
                total_probs.extend(output_dict['probs'])
                all_example_ids.extend([batch['metadata'][batch_index]['example_ids']
                                        for batch_index in range(len(batch['metadata']))])

            if (not HasBeenWarned.tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes "
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        # ksk
        if 'probs' in output_dict:
            total_probs.extend(output_dict['probs'])
            all_example_ids.extend([batch['metadata'][batch_index]['example_ids']
                                    for batch_index in range(len(batch['metadata']))])
        final_metrics["probs"] = total_probs
        final_metrics["example_ids"] = all_example_ids

        return final_metrics
def evaluate(
    model: Model,
    data_loader: DataLoader,
    cuda_device: int = -1,
    batch_weight_key: str = None,
) -> Dict[str, Any]:
    """
    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already contain
        their data).
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using this
        device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to weight
        the loss for that batch.  If this is not given, we use a weight of 1 for every batch.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                metric_name.startswith("_") for metric_name in metrics
            ):
                logger.warning(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar."
                )
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ", ".join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
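# A minimal usage sketch for the ``evaluate`` function above, assuming a trained
# ``model`` and an already-constructed ``DataLoader``; ``my_loader`` and this helper
# name are illustrative, not part of the original code.
def _example_evaluate_usage(model: Model, my_loader: DataLoader) -> None:
    # Run on CPU (cuda_device=-1) and weight every batch equally (no batch_weight_key).
    metrics = evaluate(model, my_loader, cuda_device=-1, batch_weight_key=None)
    for name, value in metrics.items():
        logger.info("%s: %s", name, value)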
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes "
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        runtime = []
        prev_time = time.time()

        for batch in generator_tqdm:
            batch_count += 1
            # if batch_count == 1000:
            #     runtime = np.asarray(runtime)
            #     print("Mean:", np.mean(runtime))
            #     print("Std:", np.std(runtime))
            #     exit(-1)
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not HasBeenWarned.tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

            runtime.append(time.time() - prev_time)
            prev_time = time.time()

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes "
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics