def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             output_file: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                id2label = model.vocab.get_index_to_token_vocabulary("labels")
                _persist_data(file_handle, batch.get("metadata"), model_output, id2label=id2label)
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics()
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            model(**batch)
            metrics = model.get_metrics()

            if (not _warned_tqdm_ignores_underscores and
                    any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
def ensure_model_can_train_save_and_load(self,
                                         model: Model,
                                         dataset: Dataset,
                                         iterator: DataIterator = None):
    model.eval()  # set eval mode, to turn off things like dropout
    data_iterator = iterator or BasicIterator()
    single_batch = next(data_iterator(dataset))
    single_batch = arrays_to_variables(single_batch)
    model_predictions = model.forward(**single_batch)

    # Check loss exists and we can compute gradients.
    model_loss = model_predictions["loss"]
    assert model_loss is not None
    model_loss.backward()

    torch.save(model.state_dict(), self.MODEL_FILE)
    loaded_model = model
    loaded_model.zero_grad()
    loaded_model.load_state_dict(torch.load(self.MODEL_FILE))
    loaded_model.eval()  # set eval mode, to turn off things like dropout
    loaded_model_predictions = loaded_model.forward(**single_batch)

    # Check loaded model's loss exists and we can compute gradients.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Both outputs should have the same keys and the values
    # for these keys should be close.
    for key in model_predictions.keys():
        assert_allclose(model_predictions[key].data.numpy(),
                        loaded_model_predictions[key].data.numpy())

    return model, loaded_model
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             output_file: str = None,
             eval_type: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            # Move the batch to the GPU if one is being used.
            batch = move_to_device(batch, cuda_device)
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                _persist_data(file_handle, batch.get("metadata"), model_output, eval_type)
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics(reset=True)
def evaluate(model: Model, instances: Iterable[Instance], task_name: str, data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]: """ Evaluate a model for a particular tasks (usually after training). Parameters ---------- model : ``allennlp.models.model.Model``, required The model to evaluate instances : ``Iterable[Instance]``, required The (usually test) dataset on which to evalute the model. task_name : ``str``, required The name of the tasks on which evaluate the model. data_iterator : ``DataIterator`` Iterator that go through the dataset. cuda_device : ``int`` Cuda device to use. Returns ------- metrics : ``Dict[str, Any]`` A dictionary containing the metrics on the evaluated dataset. """ check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = data_iterator(instances, num_epochs=1, shuffle=False) logger.info("Iterating over dataset") generator_tqdm = tqdm.tqdm( iterator, total=data_iterator.get_num_batches(instances)) eval_loss = 0 nb_batches = 0 for tensor_batch in generator_tqdm: nb_batches += 1 train_stages = ["stm", "sd", "valid"] task_index = TASKS_NAME.index(task_name) tensor_batch['task_index'] = torch.tensor(task_index) tensor_batch["reverse"] = torch.tensor(False) tensor_batch['for_training'] = torch.tensor(False) train_stage = train_stages.index("stm") tensor_batch['train_stage'] = torch.tensor(train_stage) tensor_batch = move_to_device(tensor_batch, 0) eval_output_dict = model.forward(**tensor_batch) loss = eval_output_dict["loss"] eval_loss += loss.item() metrics = model.get_metrics(task_name=task_name) metrics["stm_loss"] = float(eval_loss / nb_batches) description = training_util.description_from_metrics(metrics) generator_tqdm.set_description(description, refresh=False) metrics = model.get_metrics(task_name=task_name, reset=True) metrics["stm_loss"] = float(eval_loss / nb_batches) return metrics
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             label_fname: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()
        label_file = open(label_fname, 'w')
        label_file.write('real_label,guessed_label\n')

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        total_num_inst = 0
        for batch in generator_tqdm:
            num_inst = batch['tokens']['tokens'].size(0)
            total_num_inst += num_inst
            batch = util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            if cuda_device == -1:
                output_matrix = output_dict['label_logits'].data.numpy()
            else:
                output_matrix = output_dict['label_logits'].data.cpu().numpy()
            output_labels = np.argmax(output_matrix, axis=1)
            if cuda_device == -1:
                true_labels = batch['label'].data.numpy()
            else:
                true_labels = batch['label'].data.cpu().numpy()
            assert true_labels.shape[0] == output_labels.shape[0]
            for i in range(true_labels.shape[0]):
                label_file.write(str(int(true_labels[i])) + ',')
                label_file.write(str(int(output_labels[i])) + '\n')

            metrics = model.get_metrics()

            if (not _warned_tqdm_ignores_underscores and
                    any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        print("NUM INSTANCES ITERATED OVER: " + str(total_num_inst))
        label_file.close()

        return model.get_metrics(reset=True)
def evaluate(model: Model, instances: Iterable[Instance], task_name: str, data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]: """ Evaluate a model for a particular task (usually after training). Parameters ---------- model : ``allennlp.models.model.Model``, required The model to evaluate instances : ``Iterable[Instance]``, required The (usually test) dataset on which to evalute the model. task_name : ``str``, required The name of the task on which evaluate the model. data_iterator : ``DataIterator`` Iterator that go through the dataset. cuda_device : ``int`` Cuda device to use. Returns ------- metrics : ``Dict[str, Any]`` A dictionary containing the metrics on the evaluated dataset. """ check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = data_iterator(instances, num_epochs=1, shuffle=False) logger.info("Iterating over dataset") generator_tqdm = tqdm.tqdm( iterator, total=data_iterator.get_num_batches(instances)) eval_loss = 0 nb_batches = 0 for batch in generator_tqdm: batch = util.move_to_device(batch, cuda_device) nb_batches += 1 eval_output_dict = model.forward(task_name=task_name, tensor_batch=batch) loss = eval_output_dict["loss"] eval_loss += loss.item() metrics = model.get_metrics(task_name=task_name) metrics["loss"] = float(eval_loss / nb_batches) description = ", ".join([ "%s: %.2f" % (name, value) for name, value in metrics.items() ]) + " ||" generator_tqdm.set_description(description, refresh=False) metrics = model.get_metrics(task_name=task_name, reset=True, full=True) metrics["loss"] = float(eval_loss / nb_batches) return metrics
def evaluate(model: Model, dataset: Dataset, iterator: DataIterator, cuda_device: int) -> Dict[str, Any]: model.eval() generator = iterator(dataset, num_epochs=1) logger.info("Iterating over dataset") for batch in tqdm.tqdm(generator): tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False) model.forward(**tensor_batch) return model.get_metrics()
def evaluate(model: Model, dataset: Dataset, iterator: DataIterator, cuda_device: int) -> Dict[str, Any]: model.eval() generator = iterator(dataset, num_epochs=1) logger.info("Iterating over dataset") generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset)) output = pd.DataFrame() for raw_batch, batch in generator_tqdm: raw_fields = [x.fields for x in raw_batch.instances] parsed_fields = [] for item in raw_fields: premise = " ".join([x.text for x in item['premise'].tokens]) hypothesis = " ".join([x.text for x in item['hypothesis'].tokens]) label = item['label'].label parsed_fields.append({ "sentence1": premise, "sentence2": hypothesis, "gold_label": label }) parsed_fields = pd.DataFrame(parsed_fields) tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False) bo = model.forward(**tensor_batch) metrics = model.get_metrics() description = ', '.join( ["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||" generator_tqdm.set_description(description) batch_output = pd.DataFrame() INVERSE_LABEL_MAP = { 0: "entailment", 1: "neutral", 2: "contradiction", 3: "hidden" } batch_output['prediction_label'] = bo['label_logits'].data.numpy( ).argmax(axis=1) batch_output['prediction_score'] = bo['label_probs'].data.numpy().max( axis=1) batch_output['prediction_label'] = batch_output.prediction_label.apply( lambda x: INVERSE_LABEL_MAP[x]) parsed_output = pd.concat([parsed_fields, batch_output], axis=1) output = pd.concat([output, parsed_output], axis=0) hard_subset = output.loc[output.gold_label != output.prediction_label] easy_subset = output.loc[output.gold_label == output.prediction_label] return model.get_metrics(), hard_subset, easy_subset
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]: _warned_tqdm_ignores_underscores = False check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = data_iterator(instances, num_epochs=1, shuffle=False) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances)) batch_count = 0 loss_count = 0 total_loss = 0.0 for batch in generator_tqdm: batch_count += 1 batch = util.move_to_device(batch, cuda_device) loss = model(**batch).get("loss") metrics = model.get_metrics() if loss is not None: loss_count += 1 metrics["loss"] = loss.item() total_loss += loss.item() if (not _warned_tqdm_ignores_underscores and any(metric_name.startswith("_") for metric_name in metrics)): logger.warning("Metrics with names beginning with \"_\" will " "not be logged to the tqdm progress bar.") _warned_tqdm_ignores_underscores = True description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items() if not name.startswith("_")]) + " ||" generator_tqdm.set_description(description, refresh=False) final_metrics = model.get_metrics(reset=True) if loss_count > 0: if loss_count != batch_count: raise RuntimeError("The model you are trying to evaluate only sometimes " + "produced a loss!") final_metrics["loss"] = total_loss/batch_count return final_metrics
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]: model.eval() iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances)) for batch in generator_tqdm: model(**batch) metrics = model.get_metrics() description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||" generator_tqdm.set_description(description, refresh=False) return model.get_metrics(reset=True)
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances,
                             num_epochs=1,
                             cuda_device=cuda_device,
                             for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value)
                                 for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics()
def evaluate(model: Model, dataset: Dataset, iterator: DataIterator, cuda_device: int) -> Dict[str, Any]: model.eval() generator = iterator(dataset, num_epochs=1) logger.info("Iterating over dataset") generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset)) for batch in generator_tqdm: tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False) model.forward(**tensor_batch) metrics = model.get_metrics() description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||" generator_tqdm.set_description(description) return model.get_metrics()
def get_iter_norm_mean_eval(model: Model,
                            data_loader: DataLoader,
                            mean: torch.Tensor,
                            cuda_device: int = -1) -> Dict[str, Any]:
    """
    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data
        (data loaders already contain their data).
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using
        this device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to
        weight the loss for that batch.  If this is not given, we use a weight of 1 for every batch.
    """
    check_for_gpu(cuda_device)

    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        # mean_embeddings: [torch.Tensor, int]
        # mean_embeddings = [torch.tensor([0.], device=cuda_device), 0]
        embeddings = []
        for batch in generator_tqdm:
            batch = nn_util.move_to_device(batch, cuda_device)
            batch_embeddings = model.forward_embeddings(batch['words'], mean)
            # mean_embeddings[0] = (mean_embeddings[0] + batch_embeddings.sum(dim=0))
            # mean_embeddings[1] += batch_embeddings.shape[0]
            embeddings.append(batch_embeddings)

        # mean_embeddings[0] = mean_embeddings[0] / mean_embeddings[1]
        embeddings = torch.cat(embeddings, dim=0)

    return embeddings.mean(dim=0), embeddings  # mean_embeddings[0]
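# Hedged sketch (not part of any repository above; assumes only torch): concatenating the
# per-batch embeddings and taking the mean along dim 0, as get_iter_norm_mean_eval does, is
# equivalent to the running sum/count that its commented-out `mean_embeddings` code suggests.
import torch

batch_embeddings = [torch.randn(4, 8), torch.randn(3, 8)]   # hypothetical batch outputs
stacked_mean = torch.cat(batch_embeddings, dim=0).mean(dim=0)
running_sum = sum(b.sum(dim=0) for b in batch_embeddings)
running_count = sum(b.shape[0] for b in batch_embeddings)
assert torch.allclose(stacked_mean, running_sum / running_count)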
def get_model_predictions(model: Model,
                          instances: Iterable[Instance],
                          data_iterator: DataIterator,
                          cuda_device: int) -> (Dict[str, Any], List):
    model.eval()
    model_predictions = []

    iterator = data_iterator(instances,
                             num_epochs=1,
                             cuda_device=cuda_device,
                             for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    return model.get_metrics(), model_predictions
def evaluate(model: Model,
             dataset: InstanceCollection,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value)
                                 for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)

    return model.get_metrics()
def evaluate(
    model: Model,
    data_loader: DataLoader,
    cuda_device: int,
    batch_weight_key: str,
) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=len(data_loader))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                metric_name.startswith("_") for metric_name in metrics
            ):
                logger.warning(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar."
                )
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (
                ", ".join(
                    [
                        "%s: %.2f" % (name, value)
                        for name, value in metrics.items()
                        if not name.startswith("_")
                    ]
                )
                + " ||"
            )
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
def evaluate_predict(model: Model,
                     dataset: Dataset,
                     iterator: DataIterator,
                     cuda_device: int,
                     predict_file: TextIO,
                     gold_file: TextIO) -> Dict[str, Any]:
    # Set the model to evaluation mode (disables dropout, puts batch norm in eval mode, etc.).
    model.eval()

    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(dataset.instances, total=len(dataset.instances))

    # Recompile instances into sentences (each instance has only one predicate, but
    # multiple instances come from a single sentence and should be printed as such).
    # Map sentence indices to values.
    all_words = {}
    all_predicate_inds = defaultdict(list)
    all_gold_senses = defaultdict(list)
    all_predicted_senses = defaultdict(list)
    all_pos_tags = defaultdict(list)
    all_gold_tags = defaultdict(list)
    all_predicted_tags = defaultdict(list)

    print("setting up conll output")
    for idx, instance in enumerate(generator_tqdm):
        output = model.forward_on_instance(instance, cuda_device, calculate_loss=False)
        predicted_tags = output['tags']
        pos_tags = instance.pos_tags
        gold_senses = instance.fields['pred_sense'].label
        gold_tags = instance.fields['tags'].labels
        tokens = instance.fields['tokens'].tokens
        words = [t.text for t in tokens]
        pred_indices = instance.fields['pred_indicator'].labels
        sense_probabilities = output['psd_probabilities']
        predicted_sense = output['sense']
        if predicted_sense == model.vocab._oov_token:
            # Not a real predicate sense, because we didn't recognize the predicate;
            # guess it with a heuristic.
            tok_lemma = instance.fields['pred_sense_set'].index_label
            predicted_sense = tok_lemma.split(':')[-1] + '.01'
        if hasattr(instance, 'sentence_id'):
            sid = instance.sentence_id
        else:
            sid = instance.fields["metadata"].metadata["sentence_id"]
        if sid in all_words:
            assert all_words[sid] == words
        else:
            all_words[sid] = words
        all_predicate_inds[sid].append(pred_indices)
        all_gold_senses[sid].append(gold_senses)
        all_predicted_senses[sid].append(predicted_sense)
        all_gold_tags[sid].append(gold_tags)
        all_predicted_tags[sid].append(predicted_tags)
        all_pos_tags[sid] = pos_tags

    for sid in all_words:
        write_to_conll_2009_eval_file(predict_file, gold_file,
                                      all_words[sid],
                                      all_predicate_inds[sid],
                                      all_gold_senses[sid],
                                      all_predicted_senses[sid],
                                      all_gold_tags[sid],
                                      all_predicted_tags[sid],
                                      all_pos_tags[sid])
    print("printed conll output")
    return True
def evaluate(model: Model,
             dataset: Dataset,
             iterator: BasicIterator,
             cuda_device: int,
             serialization_directory: str) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset,
                         num_epochs=1,
                         cuda_device=cuda_device,
                         shuffle=False,
                         for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.5f" % (name, value) for name, value in metrics.items()
                                 if "overall" in name]) + " ||"
        generator_tqdm.set_description(description)

    metrics = model.get_metrics()
    golds = metrics["gold_spans"]
    predictions = metrics["predicted_spans"]
    assert len(dataset.instances) == len(golds) == len(predictions)

    # gold_file_path = os.path.join(serialization_directory, "gold.txt")
    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    prediction_file = open(prediction_file_path, "w+")
    # gold_file = open(gold_file_path, "w+")
    logger.info("Writing predictions in CoNLL-like format to %s", prediction_file_path)
    for instance, gold, prediction in tqdm.tqdm(zip(dataset.instances, golds, predictions)):
        fields = instance.fields
        if "targets" in fields:
            verb_index = fields["targets"].labels.index(1)
        elif "verb_indicator" in fields:
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None
        else:
            verb_index = None

        frame = None
        if "frame" in fields:
            frame = fields["frame"].tokens[0].text
        gf = None
        if "gf" in fields:
            gf = [g.text for g in fields["gf"].tokens]
        pt = None
        if "pt" in fields:
            pt = [p.text for p in fields["pt"].tokens]

        sentence = [token.text for token in fields["tokens"].tokens]

        gold_tags = convert_spans_to_seq(gold, len(sentence))
        predicted_tags = convert_spans_to_seq(prediction, len(sentence))
        assert len(sentence) == len(gold_tags) == len(predicted_tags)

        write_to_conll_eval_file(prediction_file,
                                 # gold_file,
                                 verb_index,
                                 sentence,
                                 predicted_tags,
                                 gold_tags,
                                 frame, gf, pt)
    return model.get_metrics()
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int, batch_weight_key: str) -> Dict[str, Any]: check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = data_iterator(instances, num_epochs=1, shuffle=False) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm( iterator, total=data_iterator.get_num_batches(instances)) # Number of batches in instances. batch_count = 0 # Number of batches where the model produces a loss. loss_count = 0 # Cumulative weighted loss total_loss = 0.0 # Cumulative weight across all batches. total_weight = 0.0 # ksk total_probs, all_example_ids = [], [] for batch in generator_tqdm: batch_count += 1 batch = nn_util.move_to_device(batch, cuda_device) output_dict = model(**batch) loss = output_dict.get("loss") metrics = model.get_metrics() if loss is not None: loss_count += 1 if batch_weight_key: weight = output_dict[batch_weight_key].item() else: weight = 1.0 total_weight += weight total_loss += loss.item() * weight # Report the average loss so far. metrics["loss"] = total_loss / total_weight # ksk if 'probs' in output_dict: total_probs.extend(output_dict['probs']) all_example_ids.extend([ batch['metadata'][batch_index]['example_ids'] for batch_index in range(len(batch['metadata'])) ]) if (not HasBeenWarned.tqdm_ignores_underscores and any( metric_name.startswith("_") for metric_name in metrics)): logger.warning("Metrics with names beginning with \"_\" will " "not be logged to the tqdm progress bar.") HasBeenWarned.tqdm_ignores_underscores = True description = ', '.join([ "%s: %.2f" % (name, value) for name, value in metrics.items() if not name.startswith("_") ]) + " ||" generator_tqdm.set_description(description, refresh=False) final_metrics = model.get_metrics(reset=True) if loss_count > 0: # Sanity check if loss_count != batch_count: raise RuntimeError( "The model you are trying to evaluate only sometimes " + "produced a loss!") final_metrics["loss"] = total_loss / total_weight # ksk if 'probs' in output_dict: total_probs.extend(output_dict['probs']) all_example_ids.extend([ batch['metadata'][batch_index]['example_ids'] for batch_index in range(len(batch['metadata'])) ]) final_metrics["probs"] = total_probs final_metrics["example_ids"] = all_example_ids return final_metrics
def evaluate(
    model: Model,
    data_loader: DataLoader,
    cuda_device: int = -1,
    batch_weight_key: str = None,
) -> Dict[str, Any]:
    """
    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data
        (data loaders already contain their data).
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using
        this device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to
        weight the loss for that batch.  If this is not given, we use a weight of 1 for every batch.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                metric_name.startswith("_") for metric_name in metrics
            ):
                logger.warning(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar."
                )
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (
                ", ".join(
                    [
                        "%s: %.2f" % (name, value)
                        for name, value in metrics.items()
                        if not name.startswith("_")
                    ]
                )
                + " ||"
            )
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
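# Hedged illustration (not part of any repository above): the weighted-loss bookkeeping shared by
# the `evaluate` variants that accept a `batch_weight_key`.  Each batch contributes loss * weight,
# the reported metric is the running weighted average, and a weight of 1.0 is assumed when no key
# is given.  All names below are hypothetical stand-ins, kept to plain Python so it runs as-is.
def weighted_average_loss(batch_losses, batch_weights=None):
    total_loss, total_weight = 0.0, 0.0
    for i, loss in enumerate(batch_losses):
        weight = batch_weights[i] if batch_weights is not None else 1.0
        total_weight += weight
        total_loss += loss * weight
    return total_loss / total_weight if total_weight > 0 else 0.0


if __name__ == "__main__":
    # Three hypothetical batches: per-batch losses weighted by (say) batch size.
    print(weighted_average_loss([0.5, 0.7, 0.3]))            # unweighted mean: 0.5
    print(weighted_average_loss([0.5, 0.7, 0.3], [32, 32, 8]))  # weighted mean, dominated by the larger batches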
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int, batch_weight_key: str) -> Dict[str, Any]: check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = data_iterator(instances, num_epochs=1, shuffle=False) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm( iterator, total=data_iterator.get_num_batches(instances)) # Number of batches in instances. batch_count = 0 # Number of batches where the model produces a loss. loss_count = 0 # Cumulative weighted loss total_loss = 0.0 # Cumulative weight across all batches. total_weight = 0.0 for batch in generator_tqdm: batch_count += 1 batch = nn_util.move_to_device(batch, cuda_device) output_dict = model(**batch) loss = output_dict.get("loss") ############ Comment out this block to save class_probabilities, logits, and losses for each batch ######### # print(output_dict['class_probabilities'].shape) # import copy # # newoutput_dict = copy.deepcopy(output_dict) # newoutput_dict['class_probabilities'] = newoutput_dict['class_probabilities'].cpu().data.numpy() # newoutput_dict['logits'] = newoutput_dict['logits'].cpu().data.numpy() # newoutput_dict['loss'] = newoutput_dict['loss'].cpu().data.numpy() # # output_file = os.path.join(os.path.dirname(__file__), '..', "data", "test", # str(batch_count) + "_output.pkl") # import json # import pickle # if output_file: # with open(output_file, "wb") as file: # pickle.dump(newoutput_dict, file) # file.close() # ########################################################################################################### metrics = model.get_metrics() if loss is not None: loss_count += 1 if batch_weight_key: weight = output_dict[batch_weight_key].item() else: weight = 1.0 total_weight += weight total_loss += loss.item() * weight # Report the average loss so far. metrics["loss"] = total_loss / total_weight if (not HasBeenWarned.tqdm_ignores_underscores and any( metric_name.startswith("_") for metric_name in metrics)): logger.warning("Metrics with names beginning with \"_\" will " "not be logged to the tqdm progress bar.") HasBeenWarned.tqdm_ignores_underscores = True description = ', '.join([ "%s: %.4f" % (name, value) for name, value in metrics.items() if not name.startswith("_") ]) + " ||" generator_tqdm.set_description(description, refresh=False) final_metrics = model.get_metrics(reset=True) if loss_count > 0: # Sanity check if loss_count != batch_count: raise RuntimeError( "The model you are trying to evaluate only sometimes " + "produced a loss!") final_metrics["loss"] = total_loss / total_weight return final_metrics
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not _warned_tqdm_ignores_underscores and
                    any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
def get_predictions(self,
                    instances: List[Instance],
                    model: Model,
                    cuda_device: int = -1,
                    prediction_file: Optional[str] = None,
                    visualization_file: Optional[str] = None,
                    verbose: bool = False) -> List[Dict]:
    """
    We use this function to get predictions.
    We use a basic iterator, since a bucket iterator shuffles data, even for shuffle=False.

    Arguments:
        instances (List[Instance]) : The list of instances for inference
        model (Model) : The model being used for predictions
        cuda_device (int) : The cuda device being used for processing
        verbose (bool) : Log accuracies and such

    Returns:
        predictions (List[Dict]) : The predictions. Each contains the following keys
            * text (List[str]): The tokens
            * pred (List[Tuple[str, float]]): The predicted labels and probs. Can potentially
              have multiple labels being predicted
            * gold (List[str]): The gold labels; can potentially have multiple gold labels
            * pred_labels (List[str]): Predicted labels for segmentation.
              Note that this method is implemented by the base classes
            * attn (Dict[str, List[float]]) : A dictionary mapping tags to attention values
            * gold_labels : The gold labels for segmentation

    Additionally, this class stores the base predictions, as well as the visualization,
    if visualization is set to True and base_dir is provided.
    """
    iterator = self._iterator(instances,
                              num_epochs=1,
                              shuffle=False,
                              cuda_device=cuda_device,
                              for_training=False)
    model.eval()
    num_batches = self._iterator.get_num_batches(instances)
    inference_generator_tqdm = Tqdm.tqdm(iterator, total=num_batches)
    predictions = []
    index = 0

    matrix = {self._indexer.ix2tags[ix]: {"tp": 0., "fp": 0, "fn": 0., "tn": 0.}
              for ix in range(len(self._indexer.ix2tags))}

    for batch in inference_generator_tqdm:
        # Currently multi-GPU data parallel is not supported.
        output_dict = model.decode(model(**batch))
        for ix in range(len(output_dict["preds"])):
            text = self._get_text_from_instance(instances[index])
            pred = output_dict["preds"][ix]
            gold = [self._indexer.get_tag(label)
                    for label in instances[index].fields['labels'].labels]
            attn = output_dict["attentions"][ix]
            gold_labels = instances[index].fields['tags'].labels
            assert all([len(attn[x]) == len(text) for x in attn])
            gold_labels = self._indexer.extract_relevant(gold_labels)
            pred_labels = self.get_segmentation_from_prediction(text=text,
                                                                preds_probs=pred,
                                                                attns=attn)
            assert len(pred_labels) == len(gold_labels) == len(text)

            gold_set = set(gold)
            pred_set, _ = [set(list(x)) for x in zip(*pred)]
            # import pdb; pdb.set_trace()
            for tag in matrix:
                if tag in gold_set and tag in pred_set:
                    matrix[tag]["tp"] += 1
                elif tag not in gold_set and tag in pred_set:
                    matrix[tag]["fp"] += 1
                elif tag in gold_set and tag not in pred_set:
                    matrix[tag]["fn"] += 1.
                else:
                    matrix[tag]["tn"] += 1.
            preds = [[x[0], float(x[1])] for x in pred]
            prediction = {"text": text,
                          "pred": preds,
                          "gold": gold,
                          "attn": attn,
                          "pred_labels": pred_labels,
                          "gold_labels": gold_labels}
            predictions.append(prediction)
            index += 1

    if prediction_file is not None and prediction_file != "":
        with open(prediction_file, "w") as f:
            json.dump(predictions, f, ensure_ascii=True, indent=4)
    if visualization_file is not None and self._visualize and visualization_file != "":
        self.visualize(predictions, visualization_file)
    if verbose:
        accs = []
        for tag in matrix:
            acc = (matrix[tag]["tp"] + matrix[tag]["tn"]) / sum(matrix[tag].values()) * 100.
            logger.info(f"Tag: {tag}, Acc: {acc:.2f}")
            accs.append(acc)
        avg_acc = sum(accs) / len(accs)
        logger.info(f"Average ACC: {avg_acc:.2f}")
        p, r, f = fscore_from_preds(predictions, False)
    return predictions
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int, batch_weight_key: str) -> Dict[str, Any]: check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = data_iterator(instances, num_epochs=1, shuffle=False) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances)) # Number of batches in instances. batch_count = 0 # Number of batches where the model produces a loss. loss_count = 0 # Cumulative weighted loss total_loss = 0.0 # Cumulative weight across all batches. total_weight = 0.0 runtime = [] prev_time = time.time() for batch in generator_tqdm: batch_count += 1 #if batch_count == 1000: # runtime = np.asarray(runtime) # print("Mean:", np.mean(runtime)) # print("Std:", np.std(runtime)) # exit(-1) batch = nn_util.move_to_device(batch, cuda_device) output_dict = model(**batch) loss = output_dict.get("loss") metrics = model.get_metrics() if loss is not None: loss_count += 1 if batch_weight_key: weight = output_dict[batch_weight_key].item() else: weight = 1.0 total_weight += weight total_loss += loss.item() * weight # Report the average loss so far. metrics["loss"] = total_loss / total_weight if (not HasBeenWarned.tqdm_ignores_underscores and any(metric_name.startswith("_") for metric_name in metrics)): logger.warning("Metrics with names beginning with \"_\" will " "not be logged to the tqdm progress bar.") HasBeenWarned.tqdm_ignores_underscores = True description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items() if not name.startswith("_")]) + " ||" generator_tqdm.set_description(description, refresh=False) runtime.append(time.time() - prev_time) prev_time = time.time() final_metrics = model.get_metrics(reset=True) if loss_count > 0: # Sanity check if loss_count != batch_count: raise RuntimeError("The model you are trying to evaluate only sometimes " + "produced a loss!") final_metrics["loss"] = total_loss / total_weight return final_metrics