Example no. 1
    def evaluate_task_test(self,
                           task,
                           checkpoint,
                           split="test",
                           return_results=True):
        """Evaluate the current model."""
        utils.log("Testing ", task.name)
        eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)

        results = self._estimator.predict(input_fn=eval_input_fn,
                                          yield_single_examples=True,
                                          checkpoint_path=checkpoint)
        scorer = task.get_scorer()
        for r in results:
            if r["task_id"] != len(self._tasks):  # ignore padding examples
                r = utils.nest_dict(r, self._config.task_names)
                scorer.update(r[task.name])
        scores = dict(scorer.get_results())

        if return_results:
            utils.log("test_results " + task.name + ": " +
                      " - ".join("{}: {}".format(k, v)
                                 for k, v in scores.items()))
            return scores
        else:
            return scorer
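
The check r["task_id"] != len(self._tasks) in Example no. 1 (and repeated in the later examples) skips padding examples, which are tagged with a task_id equal to the number of real tasks. A minimal standalone sketch of the same filter, using hypothetical data:

n_tasks = 2  # hypothetical: real tasks get ids 0..n_tasks-1
results = [{"task_id": 0}, {"task_id": 2}, {"task_id": 1}]  # middle one is padding
real = [r for r in results if r["task_id"] != n_tasks]
assert [r["task_id"] for r in real] == [0, 1]
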
Example no. 2
 def write_classification_outputs(self, tasks, trial, split):
     """Write classification predictions to disk."""
     utils.log("Writing out predictions for", tasks, split)
     predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
     results = self._estimator.predict(input_fn=predict_input_fn,
                                       yield_single_examples=True)
     # task name -> eid -> model-logits
     logits = collections.defaultdict(dict)
     a = []
     for r in results:
         if r["task_id"] != len(self._tasks):
             r = utils.nest_dict(r, self._config.task_names)
             task_name = self._config.task_names[r["task_id"]]
             # logits[task_name][r[task_name]["eid"]] = (
             #     r[task_name]["logits"] if "logits" in r[task_name]
             #     else r[task_name]["predictions"])
             logits[task_name][r[task_name]["eid"]] = {
                 'logits': r[task_name]["logits"],
                 'prediction': r[task_name]["predictions"]
             }
     for task_name in logits:
         utils.log("Pickling predictions for {:} {:} examples ({:})".format(
             len(logits[task_name]), task_name, split))
         if trial <= self._config.n_writes_test:
             utils.write_pickle(
                 logits[task_name],
                 self._config.test_predictions(task_name, split, trial))
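
The predictions above are collected into a nested mapping, task name -> example id -> record, built with collections.defaultdict(dict). A minimal standalone sketch of that pattern, with a hypothetical task name and values:

import collections

logits = collections.defaultdict(dict)  # task name -> eid -> record
logits["mnli"][0] = {"logits": [0.1, 0.7, 0.2], "prediction": 1}  # hypothetical values
logits["mnli"][1] = {"logits": [0.8, 0.1, 0.1], "prediction": 0}  # hypothetical values
assert len(logits["mnli"]) == 2  # two examples collected for task "mnli"
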
Example no. 3
    def write_classification_outputs(
            self, tasks, trial, split,
            config: configure_finetuning.FinetuningConfig):
        """Write classification predictions to disk."""
        utils.log("Writing out predictions for", tasks, split)
        predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
        results = self._estimator.predict(input_fn=predict_input_fn,
                                          yield_single_examples=True)
        # task name -> eid -> model-logits
        logits = collections.defaultdict(dict)
        for r in results:
            if r["task_id"] != len(self._tasks):
                r = utils.nest_dict(r, self._config.task_names)
                task_name = self._config.task_names[r["task_id"]]
                logits[task_name][r[task_name]["eid"]] = (
                    r[task_name]["eid"],
                    r[task_name]["input_ids"],
                    r[task_name]["input_mask"],
                    r[task_name]["token_type_ids"],
                    r[task_name]["logits"]
                    if "logits" in r[task_name] else None,
                    r[task_name]["predictions"],
                    r[task_name]["label_ids"] if "label_ids" in r[task_name]
                    else r[task_name]['targets'],
                )

        print('[RESULT]')

        tokenizer = tokenization.FullTokenizer(
            vocab_file=config.vocab_file, do_lower_case=config.do_lower_case)

        for task_name in logits:
            utils.log(
                "Saving Dev Error Analysis for {:} {:} examples ({:})".format(
                    len(logits[task_name]), task_name, split))
            if trial <= self._config.n_writes_test:
                print('Write to: ' +
                      self._config.dev_analysis(task_name, split, trial))
                with open(self._config.dev_analysis(task_name, split, trial),
                          'w',
                          encoding='utf-8') as fout:
                    fout.write('ID\tINPUT\tLOGITS\tPREDICTION\tLABEL\n')
                    for eid in logits[task_name]:
                        print('=>' + str(eid))
                        (_, input_id, input_mask, token_type_id, logit,
                         prediction, label_id) = logits[task_name][eid]
                        input_tokens = tokenizer.convert_ids_to_tokens(
                            input_id)
                        input_tokens = filter(lambda x: x != '[PAD]',
                                              input_tokens)
                        input_tokens = ' '.join(input_tokens)

                        fout.write(
                            str(eid) + '\t' + str(input_tokens) + '\t' +
                            str(logit) + '\t' + str(prediction) + '\t' +
                            str(label_id) + '\n')
                        print('Inputs: ' + str(input_tokens) + ', Logits: ' +
                              str(logit) + ', Predictions: ' +
                              str(prediction) + ', Labels: ' + str(label_id))
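
Example no. 3 writes a tab-separated error-analysis file with the columns ID, INPUT, LOGITS, PREDICTION, and LABEL. A minimal sketch of reading such a file back with the standard csv module, assuming each record fits on one line; the file name here is a placeholder for self._config.dev_analysis(task_name, split, trial):

import csv

with open("dev_analysis.tsv", encoding="utf-8") as f:  # placeholder path
    for row in csv.DictReader(f, delimiter="\t"):
        print(row["ID"], row["PREDICTION"], row["LABEL"])
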
Example no. 4
    def evaluate_task(self, task, split="dev", return_results=True):
        """Evaluate the current model."""
        utils.log("Evaluating", task.name)
        eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
        results = self._estimator.predict(input_fn=eval_input_fn,
                                          yield_single_examples=True)

        scorer = task.get_scorer()
        for r in results:
            if r["task_id"] != len(self._tasks):  # ignore padding examples
                r = utils.nest_dict(r, self._config.task_names)
                scorer.update(r[task.name])
        scorer.write_predictions()
        if return_results:
            return dict(scorer.get_results())
        return scorer
Example no. 5
 def evaluate_task(self, task, split="dev", return_results=True):
     """Evaluate the current model."""
     utils.log("Evaluating", task.name)
     eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
     # Checkpoint index files are named "model.ckpt-<step>.index";
     # sort them by training step.
     checkpoints = sorted(
         [f for f in tf.gfile.ListDirectory(self._config.model_dir)
          if f[-6:] == ".index"],
         key=lambda x: int(x[11:-6]))
     checkpoints = [
         os.path.join(self._config.model_dir, checkpoint[:-6])
         for checkpoint in checkpoints
     ]
     best_scores = None
     best_scorer = None
     key = self._config.eval_key
     for checkpoint in checkpoints:
         if int(checkpoint.split("-")[-1]) == 0:
             continue  # skip the untrained step-0 checkpoint
         results = self._estimator.predict(input_fn=eval_input_fn,
                                           yield_single_examples=True,
                                           checkpoint_path=checkpoint)
         scorer = task.get_scorer()
         for r in results:
             if r["task_id"] != len(self._tasks):  # ignore padding examples
                 r = utils.nest_dict(r, self._config.task_names)
                 scorer.update(r[task.name])
         scores = dict(scorer.get_results())
         scores["checkpoint_path"] = checkpoint
         if return_results:
             utils.log(task.name + ": " +
                       " - ".join("{}: {}".format(k, v)
                                  for k, v in scores.items()))
             utils.log()
         if (key is None or best_scores is None
                 or scores[key] > best_scores[key]):
             best_scores = scores
             best_scorer = scorer
     if return_results:
         utils.log("eval_results " + task.name + ": " +
                   " - ".join("{}: {}".format(k, v)
                              for k, v in best_scores.items()))
         return best_scores
     else:
         return best_scorer
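
The slices x[11:-6] and checkpoint[:-6] in Example no. 5 assume estimator checkpoint index files named like "model.ckpt-<step>.index" (an 11-character prefix and a 6-character suffix). A minimal standalone check of that assumption:

name = "model.ckpt-2500.index"  # hypothetical checkpoint index file
step = int(name[len("model.ckpt-"):-len(".index")])
assert step == int(name[11:-6]) == 2500
assert name[:-6] == "model.ckpt-2500"
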
Example no. 6
    def write_tagging_outputs(self, tasks, trial, split):
        """Write classification predictions to disk."""
        utils.log("Writing out predictions for", tasks, split)
        predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
        results = self._estimator.predict(input_fn=predict_input_fn,
                                          yield_single_examples=True)

        # task name -> eid -> model-logits
        labels = collections.defaultdict(dict)
        predictions = collections.defaultdict(dict)
        length = collections.defaultdict(dict)
        for r in results:
            if r["task_id"] != len(self._tasks):
                r = utils.nest_dict(r, self._config.task_names)
                task_name = self._config.task_names[r["task_id"]]
                eid = r[task_name]["eid"]
                predictions[task_name][eid] = r[task_name]["predictions"]
                labels[task_name][eid] = r[task_name]["labels"]
                length[task_name][eid] = np.sum(r[task_name]["labels_mask"])
        for task_name in predictions:
            utils.log("Pickling predictions for {:} {:} examples ({:})".format(
                len(predictions[task_name]), task_name, split))
            if trial <= self._config.n_writes_test:
                preds_file = self._config.test_predictions(
                    task_name, split, trial) + "_pred.txt"
                label_file = self._config.test_predictions(
                    task_name, split, trial) + "_label.txt"
                task_preds = predictions[task_name]
                task_labels = labels[task_name]
                task_length = length[task_name]
                num_ex = len(task_preds)
                if "/" in preds_file:
                    tf.io.gfile.makedirs(preds_file.rsplit("/", 1)[0])
                with tf.io.gfile.GFile(preds_file, "w") as fpred, \
                        tf.io.gfile.GFile(label_file, "w") as flabel:
                    # One example per line: space-separated predicted / gold
                    # tags, truncated to the example's true length.
                    for i in range(num_ex):
                        ex_len = int(task_length[i])
                        preds = [str(p) for p in task_preds[i][:ex_len]]
                        golds = [str(g) for g in task_labels[i][:ex_len]]
                        fpred.write("{}\n".format(" ".join(preds)))
                        flabel.write("{}\n".format(" ".join(golds)))
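
Example no. 6 writes one line per example to the *_pred.txt and *_label.txt files, each line holding space-separated tags truncated to the example length. A minimal sketch of reading the two files back and computing token-level accuracy; the file names are placeholders for the paths built from self._config.test_predictions(...):

with open("task_pred.txt") as fpred, open("task_label.txt") as flabel:  # placeholder paths
    correct = total = 0
    for pred_line, label_line in zip(fpred, flabel):
        preds, golds = pred_line.split(), label_line.split()
        correct += sum(p == g for p, g in zip(preds, golds))
        total += len(golds)
print("token accuracy:", correct / max(total, 1))
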
Example no. 7
 def evaluate_task(self, task, split="dev", return_results=True):
     """Evaluate the current model."""
     utils.log("Evaluating", task.name, split)
     eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
     results = self._estimator.predict(input_fn=eval_input_fn,
                                       yield_single_examples=True)
     if task.name == "cmrc2018" or task.name == "drcd":
         scorer = task.get_scorer(split)
     else:
         scorer = task.get_scorer()
     for r in results:
         if r["task_id"] != len(self._tasks):  # ignore padding examples
             r = utils.nest_dict(r, self._config.task_names)
             scorer.update(r[task.name])
     if return_results:
         utils.log(task.name + ": " + scorer.results_str())
         utils.log()
         return dict(scorer.get_results())
     else:
         return scorer
Example no. 8
 def evaluate_task(self, task, split="dev", return_results=True):
     """Evaluate the current model."""
     utils.log("Evaluating", task.name, split)
     eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
     results = self._estimator.predict(input_fn=eval_input_fn,
                                       yield_single_examples=True)
     if task.name in [
             "squad", "squadv1", "newsqa", "naturalqs", "triviaqa",
             "searchqa", "cmrc2018", "drcd", "ccks42ec", "ccks42ee",
             "ccks42single", "ccks42multi", "ner", "ccks42num", "ccks42reg"
     ]:
         scorer = task.get_scorer(split)
     else:
         scorer = task.get_scorer()
     for r in results:
         if r["task_id"] != len(self._tasks):  # ignore padding examples
             r = utils.nest_dict(r, self._config.task_names)
             scorer.update(r[task.name])
     if return_results:
         utils.log(task.name + ": " + scorer.results_str())
         utils.log()
         return dict(scorer.get_results())
     else:
         return scorer