Example #1
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             output_file: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                id2label = model.vocab.get_index_to_token_vocabulary("labels")
                _persist_data(file_handle, batch.get("metadata"), model_output, id2label=id2label)
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics()
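The `ExitStack` idiom above is what makes the output file optional while still guaranteeing cleanup. A minimal self-contained sketch of the same pattern (the function name and arguments here are invented for illustration):

from contextlib import ExitStack

def write_lines(lines, output_file=None):
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            # enter_context registers the file so ExitStack closes it on exit.
            file_handle = stack.enter_context(open(output_file, 'w'))
        for line in lines:
            if file_handle:
                file_handle.write(line + '\n')

write_lines(["a", "b"])   # no output file requested, so nothing is opened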
Example #2
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            model(**batch)
            metrics = model.get_metrics()
            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
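The underscore convention introduced above is easy to check in isolation; the metric names and values here are made up:

metrics = {"accuracy": 0.91, "f1": 0.87, "_hidden_count": 1234.0}
description = ', '.join(["%s: %.2f" % (name, value) for name, value
                         in metrics.items() if not name.startswith("_")]) + " ||"
print(description)   # accuracy: 0.91, f1: 0.87 ||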
Example #3
def evaluate(model: Model, instances: Iterable[Instance],
             data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            model(**batch)
            metrics = model.get_metrics()
            if (not _warned_tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join([
                "%s: %.2f" % (name, value)
                for name, value in metrics.items() if not name.startswith("_")
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
Example #4
    def ensure_model_can_train_save_and_load(self,
                                             model: Model,
                                             dataset: Dataset,
                                             iterator: DataIterator = None):
        model.eval()  # set eval mode, to turn off things like dropout
        data_iterator = iterator or BasicIterator()
        single_batch = next(data_iterator(dataset))
        single_batch = arrays_to_variables(single_batch)
        model_predictions = model.forward(**single_batch)

        # Check loss exists and we can compute gradients.
        model_loss = model_predictions["loss"]
        assert model_loss is not None
        model_loss.backward()

        torch.save(model.state_dict(), self.MODEL_FILE)
        # Note: the saved weights are reloaded into the same model object
        # rather than a freshly constructed one.
        loaded_model = model
        loaded_model.zero_grad()
        loaded_model.load_state_dict(torch.load(self.MODEL_FILE))
        loaded_model.eval()  # set eval mode, to turn off things like dropout
        loaded_model_predictions = loaded_model.forward(**single_batch)

        # Check loaded model's loss exists and we can compute gradients.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values
        # for these keys should be close.
        for key in model_predictions.keys():
            assert_allclose(model_predictions[key].data.numpy(),
                            loaded_model_predictions[key].data.numpy())

        return model, loaded_model
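A minimal sketch of the save/load round trip this test relies on, using a plain `torch.nn.Linear` in place of an AllenNLP model. Unlike the snippet above, which reloads the weights into the same object, this restores them into a genuinely fresh module before comparing outputs:

import torch
from numpy.testing import assert_allclose

model = torch.nn.Linear(4, 2)
torch.save(model.state_dict(), "model.th")

loaded = torch.nn.Linear(4, 2)      # freshly constructed, then restored
loaded.load_state_dict(torch.load("model.th"))
model.eval()
loaded.eval()

inputs = torch.randn(3, 4)
assert_allclose(model(inputs).detach().numpy(),
                loaded(inputs).detach().numpy())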
Example #5
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             output_file: str = None,
             eval_type: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator,
                               total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            # Move the batch to the GPU if one is being used.
            batch = move_to_device(batch, cuda_device)

            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                _persist_data(file_handle, batch.get("metadata"), model_output,
                              eval_type)
            description = ', '.join([
                "%s: %.2f" % (name, value) for name, value in metrics.items()
            ]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics(reset=True)
Example #6
def evaluate(model: Model, instances: Iterable[Instance], task_name: str,
             data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]:
    """
    Evaluate a model for a particular task (usually after training).
    
    Parameters
    ----------
    model : ``allennlp.models.model.Model``, required
        The model to evaluate
    instances : ``Iterable[Instance]``, required
        The (usually test) dataset on which to evaluate the model.
    task_name : ``str``, required
        The name of the task on which to evaluate the model.
    data_iterator : ``DataIterator``
        Iterator that goes through the dataset.
    cuda_device : ``int``
        Cuda device to use.
        
    Returns
    -------
    metrics :  ``Dict[str, Any]``
        A dictionary containing the metrics on the evaluated dataset.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        eval_loss = 0
        nb_batches = 0
        for tensor_batch in generator_tqdm:
            nb_batches += 1

            train_stages = ["stm", "sd", "valid"]
            task_index = TASKS_NAME.index(task_name)
            tensor_batch['task_index'] = torch.tensor(task_index)
            tensor_batch["reverse"] = torch.tensor(False)
            tensor_batch['for_training'] = torch.tensor(False)
            train_stage = train_stages.index("stm")
            tensor_batch['train_stage'] = torch.tensor(train_stage)
            tensor_batch = move_to_device(tensor_batch, cuda_device)

            eval_output_dict = model.forward(**tensor_batch)
            loss = eval_output_dict["loss"]
            eval_loss += loss.item()
            metrics = model.get_metrics(task_name=task_name)
            metrics["stm_loss"] = float(eval_loss / nb_batches)

            description = training_util.description_from_metrics(metrics)
            generator_tqdm.set_description(description, refresh=False)

        metrics = model.get_metrics(task_name=task_name, reset=True)
        metrics["stm_loss"] = float(eval_loss / nb_batches)
        return metrics
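For reference, a simplified stand-in for the `move_to_device` helper used above (the real one lives in `allennlp.nn.util` and handles more container types; this sketch only covers tensors nested in dicts and lists):

import torch

def move_to_device_sketch(obj, cuda_device: int):
    # cuda_device == -1 means CPU, so the batch is returned untouched.
    if cuda_device < 0:
        return obj
    if isinstance(obj, torch.Tensor):
        return obj.cuda(cuda_device)
    if isinstance(obj, dict):
        return {key: move_to_device_sketch(value, cuda_device)
                for key, value in obj.items()}
    if isinstance(obj, list):
        return [move_to_device_sketch(value, cuda_device) for value in obj]
    return obj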
Example #7
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             label_fname: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        label_file = open(label_fname, 'w')
        label_file.write('real_label,guessed_label\n')

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        total_num_inst = 0
        for batch in generator_tqdm:
            num_inst = batch['tokens']['tokens'].size(0)
            total_num_inst += num_inst
            batch = util.move_to_device(batch, cuda_device)

            output_dict = model(**batch)
            if cuda_device == -1:
                output_matrix = output_dict['label_logits'].data.numpy()
            else:
                output_matrix = output_dict['label_logits'].data.cpu().numpy()
            output_labels = np.argmax(output_matrix, axis=1)
            if cuda_device == -1:
                true_labels = batch['label'].data.numpy()
            else:
                true_labels = batch['label'].data.cpu().numpy()
            assert true_labels.shape[0] == output_labels.shape[0]
            for i in range(true_labels.shape[0]):
                label_file.write(str(int(true_labels[i])) + ',')
                label_file.write(str(int(output_labels[i])) + '\n')

            metrics = model.get_metrics()
            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)


        print("NUM INSTANCES ITERATED OVER: " + str(total_num_inst))
        label_file.close()

        return model.get_metrics(reset=True)
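The `cuda_device == -1` branching above can be avoided entirely: `.cpu()` is a no-op on a tensor that already lives on the CPU, so one code path covers both cases. A small sketch with made-up logits:

import torch

label_logits = torch.tensor([[2.0, 0.1], [0.3, 1.5]])
# .detach().cpu().numpy() works whether the tensor is on CPU or GPU.
output_labels = label_logits.detach().cpu().numpy().argmax(axis=1)
print(output_labels)   # [0 1]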
Example #8
def evaluate(model: Model, instances: Iterable[Instance], task_name: str,
             data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]:
    """
    Evaluate a model for a particular task (usually after training).
    
    Parameters
    ----------
    model : ``allennlp.models.model.Model``, required
        The model to evaluate
    instances : ``Iterable[Instance]``, required
        The (usually test) dataset on which to evaluate the model.
    task_name : ``str``, required
        The name of the task on which to evaluate the model.
    data_iterator : ``DataIterator``
        Iterator that goes through the dataset.
    cuda_device : ``int``
        Cuda device to use.
        
    Returns
    -------
    metrics :  ``Dict[str, Any]``
        A dictionary containing the metrics on the evaluated dataset.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        eval_loss = 0
        nb_batches = 0
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            nb_batches += 1

            eval_output_dict = model.forward(task_name=task_name,
                                             tensor_batch=batch)
            loss = eval_output_dict["loss"]
            eval_loss += loss.item()
            metrics = model.get_metrics(task_name=task_name)
            metrics["loss"] = float(eval_loss / nb_batches)

            description = ", ".join([
                "%s: %.2f" % (name, value) for name, value in metrics.items()
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        metrics = model.get_metrics(task_name=task_name, reset=True, full=True)
        metrics["loss"] = float(eval_loss / nb_batches)
        return metrics
Example #9
def evaluate(model: Model,
             dataset: Dataset,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    for batch in tqdm.tqdm(generator):
        tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False)
        model.forward(**tensor_batch)

    return model.get_metrics()
Example #10
def evaluate(model: Model, dataset: Dataset, iterator: DataIterator,
             cuda_device: int) -> Tuple[Dict[str, Any], pd.DataFrame, pd.DataFrame]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator,
                               total=iterator.get_num_batches(dataset))
    output = pd.DataFrame()
    for raw_batch, batch in generator_tqdm:
        raw_fields = [x.fields for x in raw_batch.instances]
        parsed_fields = []

        for item in raw_fields:
            premise = " ".join([x.text for x in item['premise'].tokens])
            hypothesis = " ".join([x.text for x in item['hypothesis'].tokens])
            label = item['label'].label
            parsed_fields.append({
                "sentence1": premise,
                "sentence2": hypothesis,
                "gold_label": label
            })
        parsed_fields = pd.DataFrame(parsed_fields)
        tensor_batch = arrays_to_variables(batch,
                                           cuda_device,
                                           for_training=False)
        bo = model.forward(**tensor_batch)
        metrics = model.get_metrics()
        description = ', '.join(
            ["%s: %.2f" % (name, value)
             for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)
        batch_output = pd.DataFrame()
        INVERSE_LABEL_MAP = {
            0: "entailment",
            1: "neutral",
            2: "contradiction",
            3: "hidden"
        }
        batch_output['prediction_label'] = bo['label_logits'].data.numpy(
        ).argmax(axis=1)
        batch_output['prediction_score'] = bo['label_probs'].data.numpy().max(
            axis=1)
        batch_output['prediction_label'] = batch_output.prediction_label.apply(
            lambda x: INVERSE_LABEL_MAP[x])
        parsed_output = pd.concat([parsed_fields, batch_output], axis=1)
        output = pd.concat([output, parsed_output], axis=0)
    hard_subset = output.loc[output.gold_label != output.prediction_label]
    easy_subset = output.loc[output.gold_label == output.prediction_label]
    return model.get_metrics(), hard_subset, easy_subset
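One design note on the snippet above: calling `pd.concat` inside the loop copies the accumulated frame on every batch, which is quadratic in the number of batches. Collecting the per-batch frames in a list and concatenating once is the usual fix; the data here is invented:

import pandas as pd

frames = []
for batch_index in range(3):
    frames.append(pd.DataFrame({"prediction_label": ["entailment"] * 2,
                                "batch": [batch_index] * 2}))
# A single concat avoids repeatedly copying the growing frame.
output = pd.concat(frames, axis=0, ignore_index=True)
print(len(output))   # 6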
Example #11
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        batch_count = 0
        loss_count = 0
        total_loss = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            loss = model(**batch).get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                metrics["loss"] = loss.item()
                total_loss += loss.item()

            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss/batch_count

        return final_metrics
Example #12
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics(reset=True)
Example #13
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics()
Example #14
def evaluate(model: Model,
             dataset: Dataset,
             iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator, total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        tensor_batch = arrays_to_variables(batch, cuda_device, for_training=False)
        model.forward(**tensor_batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)

    return model.get_metrics()
Example #15
def get_iter_norm_mean_eval(
    model: Model,
    data_loader: DataLoader,
    mean: torch.Tensor,
    cuda_device: int = -1
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already contain
        their data).
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using this
        device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to weight
        the loss for that batch.  If this is not given, we use a weight of 1 for every batch.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        embeddings = []
        for batch in generator_tqdm:
            batch = nn_util.move_to_device(batch, cuda_device)
            batch_embeddings = model.forward_embeddings(batch['words'], mean)
            embeddings.append(batch_embeddings)

        embeddings = torch.cat(embeddings, dim=0)

    return embeddings.mean(dim=0), embeddings
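The concatenate-then-mean above is equivalent to a running sum/count, which would avoid holding every embedding in memory. A quick check with random tensors:

import torch

chunks = [torch.randn(5, 8) for _ in range(3)]

# Concatenate-then-mean...
mean_cat = torch.cat(chunks, dim=0).mean(dim=0)

# ...matches a running sum divided by a running count.
total = sum(chunk.sum(dim=0) for chunk in chunks)
count = sum(chunk.shape[0] for chunk in chunks)
mean_running = total / count

assert torch.allclose(mean_cat, mean_running, atol=1e-6)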
Example #16
def get_model_predictions(model: Model, instances: Iterable[Instance],
                          data_iterator: DataIterator,
                          cuda_device: int) -> Tuple[Dict[str, Any], List]:

    model.eval()
    model_predictions = []

    iterator = data_iterator(instances,
                             num_epochs=1,
                             cuda_device=cuda_device,
                             for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator,
                               total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    return model.get_metrics(), model_predictions
Example #17
def evaluate(model: Model, dataset: InstanceCollection, iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset,
                         num_epochs=1,
                         cuda_device=cuda_device,
                         for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator,
                               total=iterator.get_num_batches(dataset))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(
            ["%s: %.2f" % (name, value)
             for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description)

    return model.get_metrics()
Example #18
def evaluate(
    model: Model, data_loader: DataLoader, cuda_device: int, batch_weight_key: str,
) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=len(data_loader))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                metric_name.startswith("_") for metric_name in metrics
            ):
                logger.warning(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar."
                )
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (
                ", ".join(
                    [
                        "%s: %.2f" % (name, value)
                        for name, value in metrics.items()
                        if not name.startswith("_")
                    ]
                )
                + " ||"
            )
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " + "produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
Example #19
def evaluate_predict(model: Model, dataset: Dataset, iterator: DataIterator,
                     cuda_device: int, predict_file: TextIO,
                     gold_file: TextIO) -> bool:
    model.eval()  # set evaluation mode: disables dropout and freezes batch-norm statistics

    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(dataset.instances, total=len(dataset.instances))

    # Recompile instances into sentences (each instance has only one predicate, but
    #   multiple instances come from a single sentence and should be printed thus)
    # Map sentence indices to values
    all_words = {}
    all_predicate_inds = defaultdict(list)
    all_gold_senses = defaultdict(list)
    all_predicted_senses = defaultdict(list)
    all_pos_tags = defaultdict(list)
    all_gold_tags = defaultdict(list)
    all_predicted_tags = defaultdict(list)

    print("setting up conll output")

    for idx, instance in enumerate(generator_tqdm):
        output = model.forward_on_instance(instance,
                                           cuda_device,
                                           calculate_loss=False)
        predicted_tags = output['tags']
        pos_tags = instance.pos_tags
        gold_senses = instance.fields['pred_sense'].label
        gold_tags = instance.fields['tags'].labels
        tokens = instance.fields['tokens'].tokens
        words = [t.text for t in tokens]
        pred_indices = instance.fields['pred_indicator'].labels

        sense_probabilities = output['psd_probabilities']

        predicted_sense = output['sense']
        if predicted_sense == model.vocab._oov_token:
            # not a real predicate sense, because we didn't recognize the predicate
            # guess it with a heuristic
            tok_lemma = instance.fields['pred_sense_set'].index_label
            predicted_sense = tok_lemma.split(':')[-1] + '.01'
        if hasattr(instance, 'sentence_id'):
            sid = instance.sentence_id
        else:
            sid = instance.fields["metadata"].metadata["sentence_id"]
        if sid in all_words:
            assert all_words[sid] == words
        else:
            all_words[sid] = words
        all_predicate_inds[sid].append(pred_indices)
        all_gold_senses[sid].append(gold_senses)
        all_predicted_senses[sid].append(predicted_sense)
        all_gold_tags[sid].append(gold_tags)
        all_predicted_tags[sid].append(predicted_tags)
        all_pos_tags[sid] = pos_tags

    for sid in all_words:
        write_to_conll_2009_eval_file(
            predict_file, gold_file, all_words[sid], all_predicate_inds[sid],
            all_gold_senses[sid], all_predicted_senses[sid],
            all_gold_tags[sid], all_predicted_tags[sid], all_pos_tags[sid])
    print("printed conll output")

    return True
Example #20
def evaluate(model: Model, dataset: Dataset, iterator: BasicIterator,
             cuda_device: int, serialization_directory: str) -> Dict[str, Any]:
    model.eval()

    generator = iterator(dataset,
                         num_epochs=1,
                         cuda_device=cuda_device,
                         shuffle=False,
                         for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = tqdm.tqdm(generator,
                               total=iterator.get_num_batches(dataset))

    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join([
            "%s: %.5f" % (name, value)
            for name, value in metrics.items() if "overall" in name
        ]) + " ||"
        generator_tqdm.set_description(description)

    metrics = model.get_metrics()
    golds = metrics["gold_spans"]
    predictions = metrics["predicted_spans"]
    assert len(dataset.instances) == len(golds) == len(predictions)

    # gold_file_path = os.path.join(serialization_directory, "gold.txt")
    prediction_file_path = os.path.join(serialization_directory,
                                        "predictions.txt")
    prediction_file = open(prediction_file_path, "w+")
    # gold_file = open(gold_file_path, "w+")
    logger.info("Writing predictions in CoNLL-like format to %s",
                prediction_file_path)

    for instance, gold, prediction in tqdm.tqdm(
            zip(dataset.instances, golds, predictions)):
        fields = instance.fields
        if "targets" in fields:
            verb_index = fields["targets"].labels.index(1)
        elif "verb_indicator" in fields:
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None
        else:
            verb_index = None

        frame = None
        if "frame" in fields:
            frame = fields["frame"].tokens[0].text
        gf = None
        if "gf" in fields:
            gf = [g.text for g in fields["gf"].tokens]
        pt = None
        if "pt" in fields:
            pt = [p.text for p in fields["pt"].tokens]

        sentence = [token.text for token in fields["tokens"].tokens]

        gold_tags = convert_spans_to_seq(gold, len(sentence))
        predicted_tags = convert_spans_to_seq(prediction, len(sentence))
        assert len(sentence) == len(gold_tags) == len(predicted_tags)

        write_to_conll_eval_file(
            prediction_file,
            #  gold_file,
            verb_index,
            sentence,
            predicted_tags,
            gold_tags,
            frame,
            gf,
            pt)

    return model.get_metrics()
Example #21
def evaluate(model: Model, instances: Iterable[Instance],
             data_iterator: DataIterator, cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0
        # Accumulators for per-instance probabilities and example ids.
        total_probs, all_example_ids = [], []

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight
                # Collect this batch's probabilities and example ids, when available.
                if 'probs' in output_dict:
                    total_probs.extend(output_dict['probs'])
                    all_example_ids.extend([
                        batch['metadata'][batch_index]['example_ids']
                        for batch_index in range(len(batch['metadata']))
                    ])

            if (not HasBeenWarned.tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join([
                "%s: %.2f" % (name, value)
                for name, value in metrics.items() if not name.startswith("_")
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " +
                    "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight
            # The probabilities and example ids were already accumulated
            # inside the loop; extending them again here would double-count
            # the final batch.
            if total_probs:
                final_metrics["probs"] = total_probs
                final_metrics["example_ids"] = all_example_ids

        return final_metrics
Example #22
def evaluate(
    model: Model, data_loader: DataLoader, cuda_device: int = -1, batch_weight_key: str = None,
) -> Dict[str, Any]:
    """
    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already contain
        their data).
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using this
        device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to weight
        the loss for that batch.  If this is not given, we use a weight of 1 for every batch.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                metric_name.startswith("_") for metric_name in metrics
            ):
                logger.warning(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar."
                )
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (
                ", ".join(
                    [
                        "%s: %.2f" % (name, value)
                        for name, value in metrics.items()
                        if not name.startswith("_")
                    ]
                )
                + " ||"
            )
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " + "produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
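The weighted averaging described in the docstring reduces to a few lines of arithmetic; the losses and weights below are made up (a natural choice of weight is the number of instances in each batch):

batch_losses = [0.5, 0.25, 0.75]
batch_weights = [32.0, 32.0, 16.0]

total_loss = sum(loss * weight for loss, weight in zip(batch_losses, batch_weights))
total_weight = sum(batch_weights)
print(total_loss / total_weight)   # 0.45, versus an unweighted mean of 0.5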
Example #23
def evaluate(model: Model, instances: Iterable[Instance],
             data_iterator: DataIterator, cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            # Uncomment this block to save class_probabilities, logits, and losses for each batch:
            # print(output_dict['class_probabilities'].shape)
            # import copy
            #
            # newoutput_dict = copy.deepcopy(output_dict)
            # newoutput_dict['class_probabilities'] = newoutput_dict['class_probabilities'].cpu().data.numpy()
            # newoutput_dict['logits'] = newoutput_dict['logits'].cpu().data.numpy()
            # newoutput_dict['loss'] = newoutput_dict['loss'].cpu().data.numpy()
            #
            # output_file = os.path.join(os.path.dirname(__file__), '..', "data", "test",
            #                            str(batch_count) + "_output.pkl")
            # import json
            # import pickle
            # if output_file:
            #     with open(output_file, "wb") as file:
            #         pickle.dump(newoutput_dict, file)
            #     file.close()
            # ###########################################################################################################

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not HasBeenWarned.tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join([
                "%s: %.4f" % (name, value)
                for name, value in metrics.items() if not name.startswith("_")
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " +
                    "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
Example #24
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
Example #25
    def get_predictions(self,
                        instances: List[Instance],
                        model: Model,
                        cuda_device: int = -1,
                        prediction_file: Optional[str] = None,
                        visualization_file: Optional[str] = None,
                        verbose: bool = False) -> List[Dict]:
        """
        We use this function to get predictions
        We use a basic itereator, since a bucket iterator shuffles
        data, even for shuffle=False

        Arguments:
            data (List[Instance]) : The list of instances for inference
            model (Model) : The model being used for predictions
            cuda_device (int) : The cuda device being used for processing
            verbose (bool) : Log accuracies and such

        Returns:
            predictions (List[Dict]) : The predictions. Each contains the
                following keys
                * text (List[str]): The tokens
                * pred (List[Tuple[str, float]]): The predicted labels and
                    probs. Can potentially have multiple labels being
                    predicted
                * gold (List[str]): The gold labels
                    can potentially have multiple gold labels
                * pred_labels (List[str]): Predicted labels for segmentation
                    Note that an this method is implemented by the base classes
                * attn (Dict[str, List[float]]) : A dictionary mapping tags to
                    attention values
                * gold_labels : The gold labels for segmentation
                    The gold labels for segmentation

        Additionally, this class stores the base_predictions, as well as the
            visualization, if visualization is set to True, and base_dir is
             provided
        """
        iterator = self._iterator(instances,
                                  num_epochs=1,
                                  shuffle=False,
                                  cuda_device=cuda_device,
                                  for_training=False)
        model.eval()
        num_batches = self._iterator.get_num_batches(instances)
        inference_generator_tqdm = Tqdm.tqdm(iterator, total=num_batches)
        predictions = []
        index = 0
        matrix = {
            self._indexer.ix2tags[ix]: {
                "tp": 0.,
                "fp": 0,
                "fn": 0.,
                "tn": 0.
            }
            for ix in range(len(self._indexer.ix2tags))
        }

        for batch in inference_generator_tqdm:
            # Currently I don't support multi-gpu data parallel
            output_dict = model.decode(model(**batch))
            for ix in range(len(output_dict["preds"])):
                text = self._get_text_from_instance(instances[index])
                pred = output_dict["preds"][ix]
                gold = [
                    self._indexer.get_tag(label)
                    for label in instances[index].fields['labels'].labels
                ]
                attn = output_dict["attentions"][ix]
                gold_labels = instances[index].fields['tags'].labels
                assert all([len(attn[x]) == len(text) for x in attn])
                gold_labels = self._indexer.extract_relevant(gold_labels)
                pred_labels = self.get_segmentation_from_prediction(
                    text=text, preds_probs=pred, attns=attn)
                assert len(pred_labels) == len(gold_labels) == len(text)
                gold_set = set(gold)
                pred_set, _ = [set(list(x)) for x in zip(*pred)]
                for tag in matrix:
                    if tag in gold_set and tag in pred_set:
                        matrix[tag]["tp"] += 1
                    elif tag not in gold_set and tag in pred_set:
                        matrix[tag]["fp"] += 1
                    elif tag in gold_set and tag not in pred_set:
                        matrix[tag]["fn"] += 1.
                    else:
                        matrix[tag]["tn"] += 1.
                preds = [[x[0], float(x[1])] for x in pred]
                prediction = {
                    "text": text,
                    "pred": preds,
                    "gold": gold,
                    "attn": attn,
                    "pred_labels": pred_labels,
                    "gold_labels": gold_labels
                }
                predictions.append(prediction)
                index += 1
        if prediction_file is not None and prediction_file != "":
            with open(prediction_file, "w") as f:
                json.dump(predictions, f, ensure_ascii=True, indent=4)
        if visualization_file is not None and self._visualize and \
                visualization_file != "":
            self.visualize(predictions, visualization_file)
        if verbose:
            accs = []
            for tag in matrix:
                acc = (matrix[tag]["tp"] + matrix[tag]["tn"]) / \
                    sum(matrix[tag].values()) * 100.
                logger.info(f"Tag: {tag}, Acc: {acc:.2f}")
                accs.append(acc)
            avg_acc = sum(accs) / len(accs)
            logger.info(f"Average ACC: {avg_acc:.2f}")
            p, r, f = fscore_from_preds(predictions, False)
        return predictions
Example #26
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        runtime = []
        prev_time = time.time()
        for batch in generator_tqdm:
            batch_count += 1
            # Uncomment to report timing statistics after 1000 batches:
            # if batch_count == 1000:
            #     runtime = np.asarray(runtime)
            #     print("Mean:", np.mean(runtime))
            #     print("Std:", np.std(runtime))
            #     exit(-1)
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not HasBeenWarned.tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)
            runtime.append(time.time() - prev_time)
            prev_time = time.time()

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
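The `runtime` bookkeeping above measures wall-clock time per batch. A self-contained version of the pattern, with a sleep standing in for the forward pass:

import time

runtime = []
prev_time = time.time()
for _ in range(5):
    time.sleep(0.01)                      # stand-in for model(**batch)
    runtime.append(time.time() - prev_time)
    prev_time = time.time()

print("Mean: %.4fs" % (sum(runtime) / len(runtime)))

Note that when the model runs on a GPU, CUDA kernels launch asynchronously, so a `torch.cuda.synchronize()` before each reading would be needed for the timings to be meaningful.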