Example #1
def write_results(config: configure_finetuning.FinetuningConfig, results):
    """Write evaluation metrics to disk."""
    utils.log("Writing results to", config.results_txt)
    utils.mkdir(config.results_txt.rsplit("/", 1)[0])
    utils.write_pickle(results, config.results_pkl)
    with tf.io.gfile.GFile(config.results_txt, "a") as f:
        results_str = ""
        for trial_results in results:
            for task_name, task_results in trial_results.items():
                if task_name == "time" or task_name == "global_step":
                    continue
                results_str += task_name + ": " + " - ".join([
                    "{:}: {:.2f}".format(k, v)
                    for k, v in task_results.items()
                ]) + "\n"

                # Neptune Metric Logging
                neptune.append_tag('ft')
                neptune.append_tag('tensorflow')
                neptune.set_property('task', task_name)
                for k, v in task_results.items():
                    neptune.log_metric(k, v)

        f.write(results_str)
    utils.write_pickle(results, config.results_pkl)
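The module-level neptune.* calls in Example #1 look like the legacy neptune-client (pre-1.0) interface, which logs to the most recently created experiment. A minimal sketch of the setup those calls would assume (the project and experiment names here are hypothetical, not from this project):

# Assumed setup for the legacy neptune-client API used above; the experiment
# must already exist before append_tag / set_property / log_metric are called.
import neptune

neptune.init(project_qualified_name='my-workspace/electra-finetune')
neptune.create_experiment(name='finetuning-eval')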
Example #2
 def write_classification_outputs(self, tasks, trial, split):
     """Write classification predictions to disk."""
     utils.log("Writing out predictions for", tasks, split)
     predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
     results = self._estimator.predict(input_fn=predict_input_fn,
                                       yield_single_examples=True)
     # task name -> eid -> model-logits
     logits = collections.defaultdict(dict)
     a = []
     for r in results:
         if r["task_id"] != len(self._tasks):
             r = utils.nest_dict(r, self._config.task_names)
             task_name = self._config.task_names[r["task_id"]]
             # logits[task_name][r[task_name]["eid"]] = (
             #     r[task_name]["logits"] if "logits" in r[task_name]
             #     else r[task_name]["predictions"])
             logits[task_name][r[task_name]["eid"]] = {
                 'logits': r[task_name]["logits"],
                 'prediction': r[task_name]["predictions"]
             }
     for task_name in logits:
         utils.log("Pickling predictions for {:} {:} examples ({:})".format(
             len(logits[task_name]), task_name, split))
         if trial <= self._config.n_writes_test:
             utils.write_pickle(
                 logits[task_name],
                 self._config.test_predictions(task_name, split, trial))
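Every example on this page goes through utils.write_pickle (and, in Examples #3 and #4, utils.load_pickle). In ELECTRA-style codebases these are typically thin wrappers over pickle and tf.io.gfile so results can be written to local or remote (e.g. GCS) paths. A minimal sketch of such helpers, under that assumption:

import pickle
import tensorflow as tf

def write_pickle(o, path):
    """Serialize an object to `path`, creating parent directories if needed."""
    if "/" in path:
        tf.io.gfile.makedirs(path.rsplit("/", 1)[0])
    with tf.io.gfile.GFile(path, "wb") as f:
        pickle.dump(o, f, -1)

def load_pickle(path):
    """Load a pickled object back from `path`."""
    with tf.io.gfile.GFile(path, "rb") as f:
        return pickle.load(f)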
Example #3
 def _get_label_mapping(self, provided_split=None, provided_sentences=None):
     if self._label_mapping is not None:
         return self._label_mapping
     if tf.io.gfile.exists(self._label_mapping_path):
         self._label_mapping = utils.load_pickle(self._label_mapping_path)
         return self._label_mapping
     utils.log("Writing label mapping for task", self.name)
     tag_counts = collections.Counter()
     train_tags = set()
     for split in ["train", "dev", "eval"]:
         if not tf.io.gfile.exists(
                 os.path.join(self.config.raw_data_dir(self.name),
                              split + ".json")):
             continue
         if split == provided_split:
             split_sentences = provided_sentences
         else:
             split_sentences, _id = self._get_labeled_sentences(split)
         for _w, tags, _t in split_sentences:
             for tag in tags:
                 tag_counts[tag] += 1
                 if provided_split == "train":
                     train_tags.add(tag)
     labels = sorted(tag_counts.keys())
     label_mapping = {label: i for i, label in enumerate(labels)}
     utils.write_pickle(label_mapping, self._label_mapping_path)
     self._label_mapping = label_mapping
     return label_mapping
Example #4
 def _get_label_mapping(self, provided_split=None, provided_sentences=None):
     # import pdb; pdb.set_trace() # IBO
     if self._label_mapping is not None:
         return self._label_mapping
     if tf.io.gfile.exists(self._label_mapping_path):
         self._label_mapping = utils.load_pickle(self._label_mapping_path)
         return self._label_mapping
     utils.log("Writing label mapping for task", self.name)
     tag_counts = collections.Counter()
     train_tags = set()
     for split in ["train", "dev", "test"]:
         if not tf.io.gfile.exists(
                 os.path.join(self.config.raw_data_dir(self.name),
                              split + ".txt")):
             continue
         if split == provided_split:
             split_sentences = provided_sentences
         else:
             split_sentences = self._get_labeled_sentences(split)
         for _, tags in split_sentences:
             if not self._is_token_level:
                 span_labels = tagging_utils.get_span_labels(tags)
                 tags = tagging_utils.get_tags(span_labels, len(tags),
                                               LABEL_ENCODING)
             for tag in tags:
                 tag_counts[tag] += 1
                 if provided_split == "train":
                     train_tags.add(tag)
     if self.name == "ccg":
         infrequent_tags = []
         for tag in tag_counts:
             if tag not in train_tags:
                 infrequent_tags.append(tag)
         label_mapping = {
             label: i
             for i, label in enumerate(
                 sorted(
                     filter(lambda t: t not in infrequent_tags,
                            tag_counts.keys())))
         }
         n = len(label_mapping)
         for tag in infrequent_tags:
             label_mapping[tag] = n
     else:
         labels = sorted(tag_counts.keys())
         label_mapping = {label: i for i, label in enumerate(labels)}
     utils.write_pickle(label_mapping, self._label_mapping_path)
     self._label_mapping = label_mapping
     return label_mapping
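The ccg branch in Example #4 maps every tag that never appears in the train split onto a single shared index appended after the regular labels. A toy illustration of that collapsing behaviour (the tag names below are hypothetical):

# Toy data: two tags seen in train, two tags seen only in dev/test.
tag_counts = {"NP": 10, "S": 5, "RARE1": 1, "RARE2": 1}
train_tags = {"NP", "S"}

infrequent_tags = [t for t in tag_counts if t not in train_tags]
label_mapping = {
    label: i
    for i, label in enumerate(sorted(t for t in tag_counts if t not in infrequent_tags))
}
n = len(label_mapping)
for tag in infrequent_tags:
    label_mapping[tag] = n

# label_mapping == {"NP": 0, "S": 1, "RARE1": 2, "RARE2": 2}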
Example #5
def write_results(config: configure_finetuning.FinetuningConfig, results):
  """Write evaluation metrics to disk."""
  utils.log("Writing results to", config.results_txt)
  utils.mkdir(config.results_txt.rsplit("/", 1)[0])
  utils.write_pickle(results, config.results_pkl)
  with tf.io.gfile.GFile(config.results_txt, "w") as f:
    results_str = ""
    for trial_results in results:
      for task_name, task_results in trial_results.items():
        if task_name == "time" or task_name == "global_step":
          continue
        results_str += task_name + ": " + " - ".join(
            ["{:}: {:.2f}".format(k, v)
             for k, v in task_results.items()]) + "\n"
    f.write(results_str)
  utils.write_pickle(results, config.results_pkl)
Example #6
def vote1(dataset, all_nbest, all_odds, qid_answers, split, output_dir):
    bagging_preds = collections.OrderedDict()
    bagging_odds = collections.OrderedDict()
    bagging_all_nbest = collections.OrderedDict()

    for qid in qid_answers:
        bagging_preds[qid] = \
          (seq([nbest[qid][0] for nbest in all_nbest]).sorted(key=lambda x: x['probability'])).list()[-1]['text']
        bagging_all_nbest[qid] = \
          [(seq([nbest[qid][0] for nbest in all_nbest]).sorted(key=lambda x: x['probability'])).list()[-1]]
        bagging_odds[qid] = np.mean([odds[qid] for odds in all_odds])

    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_preds.json'.format(split)))
    utils.write_pickle(
        bagging_all_nbest,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_all_nbest.pkl'.format(split)))
    utils.write_json(
        bagging_odds,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_null_odds.json'.format(split)))

    if split in ['train', 'dev']:
        out_eval = main2(dataset, bagging_preds, bagging_odds)
        utils.log('vote1')
        utils.log(out_eval)
    elif split == 'eval':
        for qid in bagging_preds.keys():
            if bagging_odds[qid] > -2.75:
                bagging_preds[qid] = ""
        utils.write_json(
            bagging_preds,
            os.path.join(output_dir, 'vote1',
                         'ccks42bagging_{}_1_preds.json'.format(split)))
    else:
        utils.log('{} split is not supported'.format(split))
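The seq(...) pipelines in Example #6 appear to come from the PyFunctional library and simply pick, per question id, the top-1 candidate with the highest probability across all models. Under that assumption, the same selection can be sketched with builtins only:

# Equivalent candidate selection without the seq helper: take the
# highest-probability top-1 entry across all models for a given qid.
best = max((nbest[qid][0] for nbest in all_nbest), key=lambda x: x["probability"])
bagging_preds[qid] = best["text"]
bagging_all_nbest[qid] = [best]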
Example #7
    def write_predictions(self):
        """Write final predictions to the json file."""
        unique_id_to_result = {}
        for result in self._all_results:
            unique_id_to_result[result["unique_id"]] = result

        results = {}
        total_loss = 0.
        for example in self._eval_examples:
            example_id = example.qas_id if "squad" in self._name else example.qid
            features = self._task.featurize(example, False, for_eval=True)

            results[example_id] = []
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature[self._name + "_eid"]]
                result['targets'] = feature[self._name + "_f1_score"]

                total_loss += (result['targets'] - result['predictions'])**2

                results[example_id].append(result)
        total_loss /= len(results)

        utils.write_pickle(results, self._config.f1_predict_results_file)
        utils.log(f"total_loss: {total_loss}")
Example #8
    def write_predictions(self):
        """Write final predictions to the json file."""
        unique_id_to_result = {}
        for result in self._all_results:
            unique_id_to_result[result.unique_id] = result

        _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "PrelimPrediction", [
                "feature_index", "start_index", "end_index", "start_logit",
                "end_logit"
            ])

        all_predictions = collections.OrderedDict()
        all_nbest_json = collections.OrderedDict()
        scores_diff_json = collections.OrderedDict()

        for example in self._eval_examples:
            example_id = example.qas_id if "squad" in self._name else example.qid
            features = self._task.featurize(example, False, for_eval=True)

            prelim_predictions = []
            # keep track of the minimum score of null start+end of position 0
            score_null = 1000000  # large and positive
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature[self._name + "_eid"]]
                if self._config.joint_prediction:
                    start_indexes = result.start_top_index
                    end_indexes = result.end_top_index
                else:
                    start_indexes = _get_best_indexes(result.start_logits,
                                                      self._config.n_best_size)
                    end_indexes = _get_best_indexes(result.end_logits,
                                                    self._config.n_best_size)
                # if we could have irrelevant answers, get the min score of irrelevant
                if self._v2:
                    if self._config.answerable_classifier:
                        feature_null_score = result.answerable_logit
                    else:
                        feature_null_score = result.start_logits[
                            0] + result.end_logits[0]
                    if feature_null_score < score_null:
                        score_null = feature_null_score
                for i, start_index in enumerate(start_indexes):
                    for j, end_index in enumerate(
                            end_indexes[i] if self._config.
                            joint_prediction else end_indexes):
                        # We could hypothetically create invalid predictions, e.g., predict
                        # that the start of the span is in the question. We throw out all
                        # invalid predictions.
                        if start_index >= len(feature[self._name + "_tokens"]):
                            continue
                        if end_index >= len(feature[self._name + "_tokens"]):
                            continue
                        if start_index == 0:
                            continue
                        if start_index not in feature[self._name +
                                                      "_token_to_orig_map"]:
                            continue
                        if end_index not in feature[self._name +
                                                    "_token_to_orig_map"]:
                            continue
                        if not feature[self._name +
                                       "_token_is_max_context"].get(
                                           start_index, False):
                            continue
                        if end_index < start_index:
                            continue
                        length = end_index - start_index + 1
                        if length > self._config.max_answer_length:
                            continue
                        start_logit = (result.start_top_log_probs[i]
                                       if self._config.joint_prediction else
                                       result.start_logits[start_index])
                        end_logit = (result.end_top_log_probs[i, j]
                                     if self._config.joint_prediction else
                                     result.end_logits[end_index])
                        prelim_predictions.append(
                            _PrelimPrediction(feature_index=feature_index,
                                              start_index=start_index,
                                              end_index=end_index,
                                              start_logit=start_logit,
                                              end_logit=end_logit))

            if self._v2:
                if len(prelim_predictions) == 0 and self._config.debug:
                    tokid = sorted(feature[self._name +
                                           "_token_to_orig_map"].keys())[0]
                    prelim_predictions.append(
                        _PrelimPrediction(feature_index=0,
                                          start_index=tokid,
                                          end_index=tokid + 1,
                                          start_logit=1.0,
                                          end_logit=1.0))
            prelim_predictions = sorted(prelim_predictions,
                                        key=lambda x:
                                        (x.start_logit + x.end_logit),
                                        reverse=True)

            _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
                "NbestPrediction", ["text", "start_logit", "end_logit"])

            seen_predictions = {}
            nbest = []
            for pred in prelim_predictions:
                if len(nbest) >= self._config.n_best_size:
                    break
                feature = features[pred.feature_index]
                tok_tokens = feature[self._name + "_tokens"][pred.start_index:(
                    pred.end_index + 1)]
                orig_doc_start = feature[self._name + "_token_to_orig_map"][
                    pred.start_index]
                orig_doc_end = feature[self._name +
                                       "_token_to_orig_map"][pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
                                                                 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(self._config, tok_text, orig_text)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True

                nbest.append(
                    _NbestPrediction(text=final_text,
                                     start_logit=pred.start_logit,
                                     end_logit=pred.end_logit))

            # In very rare edge cases we could have no valid predictions. So we
            # just create a nonce prediction in this case to avoid failure.
            if not nbest:
                nbest.append(
                    _NbestPrediction(text="empty",
                                     start_logit=0.0,
                                     end_logit=0.0))

            assert len(nbest) >= 1

            total_scores = []
            best_non_null_entry = None
            for entry in nbest:
                total_scores.append(entry.start_logit + entry.end_logit)
                if not best_non_null_entry:
                    if entry.text:
                        best_non_null_entry = entry

            probs = _compute_softmax(total_scores)

            nbest_json = []
            for (i, entry) in enumerate(nbest):
                output = collections.OrderedDict()
                output["text"] = entry.text
                output["probability"] = probs[i]
                output["start_logit"] = entry.start_logit
                output["end_logit"] = entry.end_logit
                nbest_json.append(dict(output))

            assert len(nbest_json) >= 1

            if not self._v2:
                all_predictions[example_id] = nbest_json[0]["text"]
            else:
                # predict "" iff the null score - the score of best non-null > threshold
                if self._config.answerable_classifier:
                    score_diff = score_null
                else:
                    score_diff = score_null - best_non_null_entry.start_logit - (
                        best_non_null_entry.end_logit)
                scores_diff_json[example_id] = score_diff
                all_predictions[example_id] = best_non_null_entry.text

            all_nbest_json[example_id] = nbest_json

        utils.write_json(dict(all_predictions),
                         self._config.qa_preds_file(self._name))
        utils.write_pickle(all_nbest_json, self._config.eval_all_nbest_file)

        if self._v2:
            utils.write_json(
                {k: float(v)
                 for k, v in six.iteritems(scores_diff_json)},
                self._config.qa_na_file(self._name))
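Example #8 calls _get_best_indexes and _compute_softmax, which are not shown on this page. In BERT/SQuAD-style reference code these helpers look roughly like the sketch below; it is reproduced here as an assumption about their behaviour, not taken from this project:

import math

def _get_best_indexes(logits, n_best_size):
    """Return the indexes of the n_best_size largest logits."""
    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
    return [index for index, _ in index_and_score[:n_best_size]]

def _compute_softmax(scores):
    """Compute softmax probabilities over a list of raw scores."""
    if not scores:
        return []
    max_score = max(scores)
    exp_scores = [math.exp(s - max_score) for s in scores]
    total = sum(exp_scores)
    return [s / total for s in exp_scores]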