Example #1
    def test_perl_eval_script_can_run_on_printed_conll_files(self):
        bio_tags = ["B-ARG-1", "I-ARG-1", "O", "B-V", "B-ARGM-ADJ", "O"]
        sentence = ["Mark", "and", "Matt", "were", "running", "fast", "."]

        gold_file_path = os.path.join(self.TEST_DIR, "gold_conll_eval.txt")
        prediction_file_path = os.path.join(self.TEST_DIR,
                                            "prediction_conll_eval.txt")
        with open(gold_file_path, "a+") as gold_file, \
                open(prediction_file_path, "a+") as prediction_file:
            # Use the same bio tags as prediction vs gold to make it obvious by looking
            # at the perl script output if something is wrong. Write them twice to
            # ensure that the perl script deals with multiple sentences.
            write_to_conll_eval_file(gold_file, prediction_file, 4, sentence,
                                     bio_tags, bio_tags)
            write_to_conll_eval_file(gold_file, prediction_file, 4, sentence,
                                     bio_tags, bio_tags)

        perl_script_command = [
            "perl",
            str(self.TOOLS_ROOT / "srl-eval.pl"), prediction_file_path,
            gold_file_path
        ]
        exit_code = subprocess.check_call(perl_script_command)
        assert exit_code == 0
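For context, every example in this section calls a write_to_conll_eval_file helper imported from the surrounding project. A minimal sketch of what such a writer does, modelled on the AllenNLP SRL utilities (the names, column widths and exact bracketing here are assumptions, not taken from this page), is to convert BIO tags into the bracketed CoNLL span notation and emit one token per line, with the predicate word in the first column and a blank line between sentences:

from typing import List, TextIO

def convert_bio_tags_to_conll_format(labels: List[str]) -> List[str]:
    # "B-ARG1 I-ARG1 O" becomes "(ARG1* *) *": open a bracket on the first tag
    # of a span, close it on the last one, and use "*" everywhere else.
    conll_labels = []
    for i, label in enumerate(labels):
        if label == "O":
            conll_labels.append("*")
            continue
        new_label = "*"
        if label[0] == "B" or i == 0 or label[1:] != labels[i - 1][1:]:
            new_label = "(" + label[2:] + new_label
        if i == len(labels) - 1 or labels[i + 1][0] == "B" or label[1:] != labels[i + 1][1:]:
            new_label = new_label + ")"
        conll_labels.append(new_label)
    return conll_labels

def write_to_conll_eval_file(prediction_file: TextIO, gold_file: TextIO,
                             verb_index: int, sentence: List[str],
                             prediction: List[str], gold_labels: List[str]) -> None:
    # Column 1 is "-" everywhere except at the predicate; column 2 is the
    # span-formatted tag. Sentences are separated by a blank line.
    verb_only_sentence = ["-"] * len(sentence)
    if verb_index is not None:
        verb_only_sentence[verb_index] = sentence[verb_index]
    conll_prediction = convert_bio_tags_to_conll_format(prediction)
    conll_gold = convert_bio_tags_to_conll_format(gold_labels)
    for word, predicted, gold in zip(verb_only_sentence, conll_prediction, conll_gold):
        prediction_file.write(word.ljust(15) + predicted.rjust(15) + "\n")
        gold_file.write(word.ljust(15) + gold.rjust(15) + "\n")
    prediction_file.write("\n")
    gold_file.write("\n")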
def main(serialization_directory, device):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    """

    config = Params.from_file(
        os.path.join(serialization_directory, "model_params.json"))
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = config['validation_data_path']

    model = Model.load(config,
                       serialization_dir=serialization_directory,
                       cuda_device=device)

    prediction_file_path = os.path.join(serialization_directory,
                                        "predictions.txt")
    gold_file_path = os.path.join(serialization_directory, "gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("Reading evaluation data from {}".format(evaluation_data_path))
    dataset = dataset_reader.read(evaluation_data_path)
    dataset.index_instances(model._vocab)
    iterator = BasicIterator(batch_size=32)

    model_predictions = []
    for batch in tqdm.tqdm(iterator(dataset, num_epochs=1, shuffle=False)):
        tensor_batch = arrays_to_variables(batch, device, for_training=False)
        result = model.forward(**tensor_batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    for instance, prediction in zip(dataset.instances, model_predictions):
        fields = instance.fields
        predicted_tags = [
            model._vocab.get_token_from_index(x, namespace="labels")
            for x in prediction
        ]
        try:
            # Most sentences have a verbal predicate, but not all.
            verb_index = fields["verb_indicator"].labels.index(1)
        except ValueError:
            verb_index = None

        gold_tags = fields["tags"].labels
        sentence = fields["tokens"].tokens

        write_to_conll_eval_file(prediction_file, gold_file, verb_index,
                                 sentence, predicted_tags, gold_tags)
    prediction_file.close()
    gold_file.close()
Example #3
    def test_span_f1_matches_perl_script_for_continued_arguments(self):
        bio_tags = ["B-ARG1", "O", "B-C-ARG1", "B-V", "B-ARGM-ADJ", "O"]
        sentence = ["Mark", "and", "Matt", "were", "running", "fast", "."]

        gold_indices = [self.vocab.get_token_index(x, "tags") for x in bio_tags]
        gold_tensor = torch.Tensor([gold_indices])
        prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")])
        mask = torch.LongTensor([[1, 1, 1, 1, 1, 1]])

        # Make prediction so that it is exactly correct.
        for i, tag_index in enumerate(gold_indices):
            prediction_tensor[0, i, tag_index] = 1

        metric = SpanBasedF1Measure(self.vocab, "tags")
        metric(prediction_tensor, gold_tensor, mask)
        metric_dict = metric.get_metric()

        # We merged the continued ARG1 label into a single span, so there should
        # be exactly 1 true positive for ARG1 and nothing present for C-ARG1
        assert metric._true_positives["ARG1"] == 1
        # The labels containing continuation references get merged into
        # the labels that they continue, so they should never appear in
        # the precision/recall counts.
        assert "C-ARG1" not in metric._true_positives.keys()
        assert metric._true_positives["V"] == 1
        assert metric._true_positives["ARGM-ADJ"] == 1

        numpy.testing.assert_almost_equal(metric_dict["recall-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["precision-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["recall-V"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["precision-V"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["f1-measure-V"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["recall-ARGM-ADJ"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["precision-ARGM-ADJ"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["f1-measure-ARGM-ADJ"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["recall-overall"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["precision-overall"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict["f1-measure-overall"], 1.0)

        # Check that the number of true positive ARG1 labels is the same as the perl script's output:
        gold_file_path = os.path.join(self.TEST_DIR, "gold_conll_eval.txt")
        prediction_file_path = os.path.join(self.TEST_DIR, "prediction_conll_eval.txt")
        with open(gold_file_path, "a+") as gold_file, open(prediction_file_path, "a+") as prediction_file:
            # Use the same bio tags as prediction vs gold to make it obvious by looking
            # at the perl script output if something is wrong.
            write_to_conll_eval_file(gold_file, prediction_file, 4, sentence, bio_tags, bio_tags)
        # Run the official perl script and collect stdout.
        perl_script_command = ["perl", str(self.TOOLS_ROOT / "srl-eval.pl"), prediction_file_path, gold_file_path]
        stdout = subprocess.check_output(perl_script_command, universal_newlines=True)
        stdout_lines = stdout.split("\n")
        # Parse the stdout of the perl script to find the ARG1 row (this happens to be line 8).
        num_correct_arg1_instances_from_perl_evaluation = int([token for token in
                                                               stdout_lines[8].split(" ") if token][1])
        assert num_correct_arg1_instances_from_perl_evaluation == metric._true_positives["ARG1"]
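The continuation handling asserted above can be illustrated with a small standalone sketch (this is not AllenNLP's actual SpanBasedF1Measure code, just the idea): BIO tags are first grouped into labelled spans, and every C-XXX span is then folded into the XXX span it continues, which is why C-ARG1 never appears in the true-positive counts while ARG1 gets exactly one span covering tokens 0 through 2:

from typing import List, Tuple

def bio_to_spans(tags: List[str]) -> List[Tuple[str, Tuple[int, int]]]:
    # Group a BIO sequence into (label, (start, end)) spans with inclusive ends.
    spans = []
    current_label, start = None, None
    for i, tag in enumerate(tags):
        closes = tag == "O" or tag.startswith("B-") or (tag.startswith("I-") and tag[2:] != current_label)
        if closes and current_label is not None:
            spans.append((current_label, (start, i - 1)))
            current_label, start = None, None
        if tag.startswith("B-"):
            current_label, start = tag[2:], i
    if current_label is not None:
        spans.append((current_label, (start, len(tags) - 1)))
    return spans

def merge_continuations(spans: List[Tuple[str, Tuple[int, int]]]) -> List[Tuple[str, Tuple[int, int]]]:
    # Fold every "C-XXX" span into the earlier "XXX" span it continues,
    # keeping the smallest start and the largest end, so only "XXX" is counted.
    merged = []
    for label, (start, end) in spans:
        if label.startswith("C-"):
            base = label[2:]
            for j, (other_label, (other_start, other_end)) in enumerate(merged):
                if other_label == base:
                    merged[j] = (base, (min(start, other_start), max(end, other_end)))
                    break
            else:
                merged.append((base, (start, end)))
        else:
            merged.append((label, (start, end)))
    return merged

bio_tags = ["B-ARG1", "O", "B-C-ARG1", "B-V", "B-ARGM-ADJ", "O"]
print(merge_continuations(bio_to_spans(bio_tags)))
# [('ARG1', (0, 2)), ('V', (3, 3)), ('ARGM-ADJ', (4, 4))]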
def write_predictions_viterbi_decoded(serialization_dir, split, epoch, predicted_tags,
                                      vocab: Vocabulary,
                                      tokens: Dict[str, torch.LongTensor],
                                      verb_indicator: torch.LongTensor,
                                      tags: torch.LongTensor = None,
                                      pos_tags: torch.LongTensor = None,
                                      spans: torch.LongTensor = None,
                                      span_labels: torch.LongTensor = None,
                                      metadata: Any = None):

    prediction_file_path = os.path.join(serialization_dir, "predictions", "predictions-" + split + "-" + str(epoch) + ".txt")
    gold_file_path = os.path.join(serialization_dir, "predictions", "gold-" + split + "-" + str(epoch) + ".txt")

    if not os.path.exists(os.path.dirname(prediction_file_path)):
        try:
            os.makedirs(os.path.dirname(prediction_file_path))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    # logger.info("Writing gold srl tags (in conll file format) to %s", gold_file_path)
    # logger.info("Writing predicted srl tags (in conll file format) to %s", prediction_file_path)

    prediction_file = open(prediction_file_path, "a+")
    gold_file = open(gold_file_path, "a+")

    sentences = tokens["tokens"]
    mask = get_text_field_mask(tokens)
    sentence_lengths = get_lengths_from_binary_sequence_mask(mask).data.tolist()

    for sentence, _gold_tags, _verb_indicator, _length, _predicted_tags in zip(
            sentences.data.cpu(), tags.data.cpu(), verb_indicator.data.cpu(),
            sentence_lengths, predicted_tags.data.cpu()):
        # Rebuild the string forms of the words and labels, dropping padding.
        words = [vocab.get_token_from_index(x, namespace="tokens")
                 for x in sentence[:_length]]
        gold_labels = [vocab.get_token_from_index(x, namespace="labels")
                       for x in _gold_tags[:_length]]
        _verb_indicator = [x for x in _verb_indicator[:_length]]

        prediction = [vocab.get_token_from_index(x, namespace="labels")
                      for x in _predicted_tags[:_length]]

        try:
            verb_index = _verb_indicator.index(1)
        except ValueError:
            verb_index = None

        # Defined in semantic_role_labeler model implementation
        write_to_conll_eval_file(prediction_file=prediction_file, gold_file=gold_file,
                                 verb_index=verb_index, sentence=words, prediction=prediction,
                                 gold_labels=gold_labels)
    prediction_file.close()
    gold_file.close()
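The sentence lengths used above come straight from the padding mask: get_lengths_from_binary_sequence_mask amounts to a row-wise sum over the mask, roughly as in this sketch (the helper name here is illustrative):

import torch

def lengths_from_mask(mask: torch.LongTensor) -> torch.LongTensor:
    # mask has shape (batch_size, sequence_length) with 1 for real tokens and
    # 0 for padding, so each sentence length is just the sum along the row.
    return mask.long().sum(-1)

mask = torch.LongTensor([[1, 1, 1, 0, 0],
                         [1, 1, 1, 1, 1]])
print(lengths_from_mask(mask).tolist())  # [3, 5]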
Example #5
    def test_perl_eval_script_can_run_on_printed_conll_files(self):
        bio_tags = ["B-ARG-1", "I-ARG-1", "O", "B-V", "B-ARGM-ADJ", "O"]
        sentence = ["Mark", "and", "Matt", "were", "running", "fast", "."]

        gold_file_path = os.path.join(self.TEST_DIR, "gold_conll_eval.txt")
        prediction_file_path = os.path.join(self.TEST_DIR, "prediction_conll_eval.txt")
        with open(gold_file_path, "a+") as gold_file, open(prediction_file_path, "a+") as prediction_file:
            # Use the same bio tags as prediction vs gold to make it obvious by looking
            # at the perl script output if something is wrong. Write them twice to
            # ensure that the perl script deals with multiple sentences.
            write_to_conll_eval_file(gold_file, prediction_file, 4, sentence, bio_tags, bio_tags)
            write_to_conll_eval_file(gold_file, prediction_file, 4, sentence, bio_tags, bio_tags)

        perl_script_command = ["perl", str(self.TOOLS_ROOT / "srl-eval.pl"), prediction_file_path, gold_file_path]
        exit_code = subprocess.check_call(perl_script_command)
        assert exit_code == 0
def main(serialization_directory, device):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    """

    config = Params.from_file(os.path.join(serialization_directory, "config.json"))
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = config['validation_data_path']

    model = Model.load(config, serialization_dir=serialization_directory, cuda_device=device)

    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    gold_file_path = os.path.join(serialization_directory, "gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("Reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(model.vocab)

    model_predictions = []
    batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device, for_training=False)
    for batch in Tqdm.tqdm(batches):
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    for instance, prediction in zip(instances, model_predictions):
        fields = instance.fields
        try:
            # Most sentences have a verbal predicate, but not all.
            verb_index = fields["verb_indicator"].labels.index(1)
        except ValueError:
            verb_index = None

        gold_tags = fields["tags"].labels
        sentence = fields["tokens"].tokens

        write_to_conll_eval_file(prediction_file, gold_file,
                                 verb_index, sentence, prediction, gold_tags)
    prediction_file.close()
    gold_file.close()
def write_predictions(serialization_dir, instances, model_predictions, split, epoch = None):

    if epoch is not None:
        prediction_file_path = os.path.join(serialization_dir, "predictions", "predictions-" + split + "-" + str(epoch) + ".txt")
        gold_file_path = os.path.join(serialization_dir, "predictions", "gold-" + split + "-" + str(epoch) + ".txt")
    else:
        prediction_file_path = os.path.join(serialization_dir, "predictions", "predictions-" + split + ".txt")
        gold_file_path = os.path.join(serialization_dir, "predictions", "gold-" + split + ".txt")

    if not os.path.exists(os.path.dirname(prediction_file_path)):
        try:
            os.makedirs(os.path.dirname(prediction_file_path))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    logger.info("Writing gold srl tags (in conll file format) to %s", gold_file_path)
    logger.info("Writing predicted srl tags (in conll file format) to %s", prediction_file_path)

    prediction_file = open(prediction_file_path, "a+")
    gold_file = open(gold_file_path, "a+")

    for instance, prediction in zip(instances, model_predictions):
        fields = instance.fields
        try:
            # Most sentences have a verbal predicate, but not all.
            verb_index = fields["verb_indicator"].labels.index(1)
        except ValueError:
            verb_index = None

        gold_labels = fields["tags"].labels
        sentence = fields["tokens"].tokens

        # Defined in semantic_role_labeler model implementation
        write_to_conll_eval_file(prediction_file, gold_file,
                                 verb_index, sentence, prediction, gold_labels)
    prediction_file.close()
    gold_file.close()
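Both write_predictions and write_predictions_viterbi_decoded wrap os.makedirs in a try/except to survive a concurrent create. On Python 3.2 and later the same guard can be written more directly; a sketch, assuming prediction_file_path is in scope as above:

import os

# exist_ok=True makes the call a no-op if the directory already exists,
# which also covers the race where another process creates it first.
os.makedirs(os.path.dirname(prediction_file_path), exist_ok=True)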
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = data if data else config['validation_data_path']

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"), cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)

    with torch.autograd.no_grad():
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(model.vocab)

        model_predictions = []
        batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device)
        for batch in Tqdm.tqdm(batches):
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        for instance, prediction in zip(instances, model_predictions):
            fields = instance.fields
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None

            gold_tags = fields["tags"].labels
            sentence = [x.text for x in fields["tokens"].tokens]

            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, prediction, gold_tags)
        prediction_file.close()
        gold_file.close()
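Scripts like this main are normally driven from the command line. A hypothetical argparse entry point matching the signature used here (the flag names are assumptions, not taken from the original script) could look like:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Write SRL predictions and gold tags in CoNLL format for srl-eval.pl.")
    parser.add_argument("serialization_directory", type=str,
                        help="The directory containing the serialized model.")
    parser.add_argument("--device", type=int, default=-1,
                        help="The CUDA device to run evaluation on, or -1 for CPU.")
    parser.add_argument("--data", type=str, default=None,
                        help="Optional path to the data to evaluate on.")
    parser.add_argument("--prefix", type=str, default="",
                        help="Prefix for the generated gold and prediction files.")
    parser.add_argument("--domain", type=str, default=None,
                        help="Optional Ontonotes domain to restrict the evaluation data to.")
    args = parser.parse_args()
    main(args.serialization_directory, args.device, args.data, args.prefix, args.domain)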
Example #9
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(
        os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    evaluation_data_path = data if data else config["validation_data_path"]

    archive = load_archive(os.path.join(serialization_directory,
                                        "model.tar.gz"),
                           cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory,
                                        prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory,
                                  prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)

    with torch.autograd.no_grad():
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(model.vocab)

        model_predictions = []
        batches = iterator(instances, num_epochs=1, shuffle=False)
        for batch in Tqdm.tqdm(batches):
            batch = move_to_device(batch, device)
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        for instance, prediction in zip(instances, model_predictions):
            fields = instance.fields
            verb_index = fields["metadata"]["verb_index"]
            gold_tags = fields["metadata"]["gold_tags"]
            sentence = fields["metadata"]["words"]
            write_to_conll_eval_file(prediction_file, gold_file, verb_index,
                                     sentence, prediction, gold_tags)
        prediction_file.close()
        gold_file.close()
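Unlike the earlier examples, this version does not rebuild the sentence and tags from indexed tensors; it reads them from a metadata field attached by the dataset reader. Purely as an illustration (the tags below are invented, only the key names come from the example), such a field might be constructed like this:

from allennlp.data.fields import MetadataField

# Hypothetical contents, shown only to illustrate the keys Example #9 reads back out.
metadata = MetadataField({
    "words": ["Mark", "and", "Matt", "were", "running", "fast", "."],
    "verb_index": 4,
    "gold_tags": ["B-ARG1", "I-ARG1", "I-ARG1", "O", "B-V", "B-ARGM-MNR", "O"],
})
print(metadata["verb_index"])  # 4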
Example #10
def srl_conll_evaluate(pred_file: str,
                       gold_file: str,
                       silent: bool = False,
                       min_length: int = 0):
    """
    Evaluate predictions against gold labels with the official CoNLL SRL perl script.

    pred_file : str, required.
        Path to the file containing the model's predictions.
    gold_file : str, required.
        Path to the file containing the gold annotations.
    silent : bool, default = False
        If True, do not print the script's output to stdout.
    min_length : int, default = 0
        If greater than 0, only evaluate sentences with at least this many words.

    Returns a dict with the overall CoNLL F1, the percentage of perfectly
    predicted propositions, and the per-label F1 scores.
    """
    pred_data = load_srl_data(pred_file)
    gold_data = load_srl_data(gold_file)
    if min_length > 0:
        pred_conll_file = pred_file + ".conll_" + str(min_length)
        gold_conll_file = gold_file + ".conll_" + str(min_length)
    else:
        pred_conll_file = pred_file + ".conll"
        gold_conll_file = gold_file + ".conll"
    assert len(pred_data) == len(gold_data)
    with open(pred_conll_file, mode='w') as pred_conll, \
            open(gold_conll_file, mode='w') as gold_conll:
        for gold, pred in zip(gold_data, pred_data):
            try:
                # Most sentences have a verbal predicate, but not all; a missing
                # key raises KeyError (not ValueError), so catch both here.
                verb_index = gold["target_verb_position"]
            except (KeyError, ValueError):
                verb_index = None

            gold_tags = gold["tags"]
            pred_tags = pred["tags"]
            sentence = gold["words"]
            if min_length > 0:
                if len(sentence) < min_length:
                    continue
            write_to_conll_eval_file(pred_conll, gold_conll, verb_index,
                                     sentence, pred_tags, gold_tags)

    with tempfile.NamedTemporaryFile(mode='r', delete=True) as scores:
        eval_script = "gcd/metrics/srl_perl/bin/srl-eval.pl"
        eval_lib = "gcd/metrics/srl_perl/lib"
        scores_path = scores.name
        # command = f"perl -I {eval_lib} {eval_script} {gold_conll_file} {pred_conll_file} > {scores_path}"
        command = "perl -I %s %s %s %s > %s" % (eval_lib, eval_script,
                                                gold_conll_file,
                                                pred_conll_file, scores_path)
        # print("running", command)
        os.system(command)
        result = scores.read().split('\n')
        # print(result)
        if not silent:
            for r in result:
                print(r)
        """
        Number of Sentences    :        3248
        Number of Propositions :        3221
        Percentage of perfect props :  68.89

                      corr.  excess  missed    prec.    rec.      F1
        ------------------------------------------------------------
           Overall     4810     997    1081    82.83   81.65   82.24
        ----------
                A0     1803     287     268    86.27   87.06   86.66
                A1     2448     521     525    82.45   82.34   82.40
                A2      450     163     218    73.41   67.37   70.26
                A3       67      16      45    80.72   59.82   68.72
                A4       41      10      24    80.39   63.08   70.69
                A5        1       0       1   100.00   50.00   66.67
        ------------------------------------------------------------
        ------------------------------------------------------------
        """
        conll_f1 = float(result[6].strip().split()[-1])
        perfect_props_percent = float(result[2].strip().split(':')[-1])
        label_f1s = {}
        for r in result[8:]:
            try:
                label_f1 = float(r.strip().split()[-1])
                label = r.strip().split()[0]
                label_f1s[label] = label_f1
            except ValueError:
                break
        return {
            "conll_f1": conll_f1,
            "perfect_props_percent": perfect_props_percent,
            "label_f1s": label_f1s
        }
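The os.system call above relies on shell redirection to capture the scorer's output. An equivalent sketch using subprocess, with the same variables assumed to be in scope, avoids the shell and raises if perl exits with a non-zero status:

import subprocess

# Write the scorer's stdout directly into the temporary scores file.
with open(scores_path, "w") as scores_out:
    subprocess.run(
        ["perl", "-I", eval_lib, eval_script, gold_conll_file, pred_conll_file],
        stdout=scores_out,
        check=True,
    )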