def write_results(config: configure_finetuning.FinetuningConfig, results):
  """Write evaluation metrics to disk."""
  utils.log("Writing results to", config.results_txt)
  utils.mkdir(config.results_txt.rsplit("/", 1)[0])
  utils.write_pickle(results, config.results_pkl)
  with tf.io.gfile.GFile(config.results_txt, "a") as f:
    results_str = ""
    for trial_results in results:
      for task_name, task_results in trial_results.items():
        if task_name == "time" or task_name == "global_step":
          continue
        results_str += task_name + ": " + " - ".join([
            "{:}: {:.2f}".format(k, v) for k, v in task_results.items()
        ]) + "\n"
        # Neptune metric logging
        neptune.append_tag('ft')
        neptune.append_tag('tensorflow')
        neptune.set_property('task', task_name)
        for k, v in task_results.items():
          neptune.log_metric(k, v)
    f.write(results_str)
  utils.write_pickle(results, config.results_pkl)

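# The bare neptune.* calls in write_results assume a legacy neptune-client
# (pre-1.0) run has already been initialized elsewhere in the training script.
# A minimal sketch of that setup, for illustration only; the project and
# experiment names below are placeholders, not values from this repository:
import neptune

neptune.init(project_qualified_name='my-workspace/electra-finetuning')
neptune.create_experiment(name='finetune', params={'model': 'electra'})
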
def write_classification_outputs(self, tasks, trial, split):
  """Write classification predictions to disk."""
  utils.log("Writing out predictions for", tasks, split)
  predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
  results = self._estimator.predict(input_fn=predict_input_fn,
                                    yield_single_examples=True)
  # task name -> eid -> model outputs
  logits = collections.defaultdict(dict)
  for r in results:
    if r["task_id"] != len(self._tasks):  # ignore padding examples
      r = utils.nest_dict(r, self._config.task_names)
      task_name = self._config.task_names[r["task_id"]]
      # Store both the raw logits and the discrete prediction for each example.
      logits[task_name][r[task_name]["eid"]] = {
          'logits': r[task_name]["logits"],
          'prediction': r[task_name]["predictions"]
      }
  for task_name in logits:
    utils.log("Pickling predictions for {:} {:} examples ({:})".format(
        len(logits[task_name]), task_name, split))
    if trial <= self._config.n_writes_test:
      utils.write_pickle(
          logits[task_name],
          self._config.test_predictions(task_name, split, trial))

def _get_label_mapping(self, provided_split=None, provided_sentences=None):
  if self._label_mapping is not None:
    return self._label_mapping
  if tf.io.gfile.exists(self._label_mapping_path):
    self._label_mapping = utils.load_pickle(self._label_mapping_path)
    return self._label_mapping
  utils.log("Writing label mapping for task", self.name)
  tag_counts = collections.Counter()
  train_tags = set()
  for split in ["train", "dev", "eval"]:
    if not tf.io.gfile.exists(
        os.path.join(self.config.raw_data_dir(self.name), split + ".json")):
      continue
    if split == provided_split:
      split_sentences = provided_sentences
    else:
      split_sentences, _id = self._get_labeled_sentences(split)
    for _w, tags, _t in split_sentences:
      for tag in tags:
        tag_counts[tag] += 1
        if provided_split == "train":
          train_tags.add(tag)
  labels = sorted(tag_counts.keys())
  label_mapping = {label: i for i, label in enumerate(labels)}
  utils.write_pickle(label_mapping, self._label_mapping_path)
  self._label_mapping = label_mapping
  return label_mapping

def _get_label_mapping(self, provided_split=None, provided_sentences=None):
  if self._label_mapping is not None:
    return self._label_mapping
  if tf.io.gfile.exists(self._label_mapping_path):
    self._label_mapping = utils.load_pickle(self._label_mapping_path)
    return self._label_mapping
  utils.log("Writing label mapping for task", self.name)
  tag_counts = collections.Counter()
  train_tags = set()
  for split in ["train", "dev", "test"]:
    if not tf.io.gfile.exists(
        os.path.join(self.config.raw_data_dir(self.name), split + ".txt")):
      continue
    if split == provided_split:
      split_sentences = provided_sentences
    else:
      split_sentences = self._get_labeled_sentences(split)
    for _, tags in split_sentences:
      if not self._is_token_level:
        span_labels = tagging_utils.get_span_labels(tags)
        tags = tagging_utils.get_tags(span_labels, len(tags), LABEL_ENCODING)
      for tag in tags:
        tag_counts[tag] += 1
        if provided_split == "train":
          train_tags.add(tag)
  if self.name == "ccg":
    # For CCG, tags that never appear in the training data are placed after
    # the in-training tags.
    infrequent_tags = []
    for tag in tag_counts:
      if tag not in train_tags:
        infrequent_tags.append(tag)
    label_mapping = {
        label: i for i, label in enumerate(sorted(
            filter(lambda t: t not in infrequent_tags, tag_counts.keys())))
    }
    n = len(label_mapping)
    for tag in infrequent_tags:
      label_mapping[tag] = n
  else:
    labels = sorted(tag_counts.keys())
    label_mapping = {label: i for i, label in enumerate(labels)}
  utils.write_pickle(label_mapping, self._label_mapping_path)
  self._label_mapping = label_mapping
  return label_mapping

def write_results(config: configure_finetuning.FinetuningConfig, results):
  """Write evaluation metrics to disk."""
  utils.log("Writing results to", config.results_txt)
  utils.mkdir(config.results_txt.rsplit("/", 1)[0])
  utils.write_pickle(results, config.results_pkl)
  with tf.io.gfile.GFile(config.results_txt, "w") as f:
    results_str = ""
    for trial_results in results:
      for task_name, task_results in trial_results.items():
        if task_name == "time" or task_name == "global_step":
          continue
        results_str += task_name + ": " + " - ".join(
            ["{:}: {:.2f}".format(k, v)
             for k, v in task_results.items()]) + "\n"
    f.write(results_str)
  utils.write_pickle(results, config.results_pkl)

def vote1(dataset, all_nbest, all_odds, qid_answers, split, output_dir):
  """Simple ensemble ('vote1'): for each question, keep the highest-probability
  top-1 candidate across models and average the null odds."""
  bagging_preds = collections.OrderedDict()
  bagging_odds = collections.OrderedDict()
  bagging_all_nbest = collections.OrderedDict()
  for qid in qid_answers:
    bagging_preds[qid] = (
        seq([nbest[qid][0] for nbest in all_nbest])
        .sorted(key=lambda x: x['probability'])).list()[-1]['text']
    bagging_all_nbest[qid] = [
        (seq([nbest[qid][0] for nbest in all_nbest])
         .sorted(key=lambda x: x['probability'])).list()[-1]]
    # Cast to a built-in float so the averaged odds stay JSON-serializable.
    bagging_odds[qid] = float(np.mean([odds[qid] for odds in all_odds]))
  utils.write_json(
      bagging_preds,
      os.path.join(output_dir, 'vote1',
                   'ccks42bagging_{}_preds.json'.format(split)))
  utils.write_pickle(
      bagging_all_nbest,
      os.path.join(output_dir, 'vote1',
                   'ccks42bagging_{}_all_nbest.pkl'.format(split)))
  utils.write_json(
      bagging_odds,
      os.path.join(output_dir, 'vote1',
                   'ccks42bagging_{}_null_odds.json'.format(split)))
  if split in ['train', 'dev']:
    out_eval = main2(dataset, bagging_preds, bagging_odds)
    utils.log('vote1')
    utils.log(out_eval)
  elif split == 'eval':
    # Predict no answer when the averaged null odds exceed the threshold.
    for qid in bagging_preds.keys():
      if bagging_odds[qid] > -2.75:
        bagging_preds[qid] = ""
    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_1_preds.json'.format(split)))
  else:
    utils.log('{} split is not supported'.format(split))

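# The seq(...) pipeline in vote1 appears to be PyFunctional's `seq`
# (`from functional import seq`); that is an assumption, not confirmed by this
# excerpt. A minimal plain-Python sketch of the same selection logic, using a
# hypothetical helper name:
def _pick_best_candidate(all_nbest, qid):
  """Return the highest-probability top-1 candidate for `qid` across models."""
  top_candidates = [nbest[qid][0] for nbest in all_nbest]
  # Equivalent to sorting ascending by probability and taking the last element.
  return max(top_candidates, key=lambda x: x['probability'])
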
def write_predictions(self):
  """Write final predictions and their mean squared-error loss to disk."""
  unique_id_to_result = {}
  for result in self._all_results:
    unique_id_to_result[result["unique_id"]] = result
  results = {}
  total_loss = 0.
  for example in self._eval_examples:
    example_id = example.qas_id if "squad" in self._name else example.qid
    features = self._task.featurize(example, False, for_eval=True)
    results[example_id] = []
    for (feature_index, feature) in enumerate(features):
      result = unique_id_to_result[feature[self._name + "_eid"]]
      result['targets'] = feature[self._name + "_f1_score"]
      total_loss += (result['targets'] - result['predictions'])**2
      results[example_id].append(result)
  total_loss /= len(results)
  utils.write_pickle(results, self._config.f1_predict_results_file)
  utils.log(f"total_loss: {total_loss}")

def write_predictions(self):
  """Write final predictions to the json file."""
  unique_id_to_result = {}
  for result in self._all_results:
    unique_id_to_result[result.unique_id] = result

  _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
      "PrelimPrediction",
      ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])

  all_predictions = collections.OrderedDict()
  all_nbest_json = collections.OrderedDict()
  scores_diff_json = collections.OrderedDict()

  for example in self._eval_examples:
    example_id = example.qas_id if "squad" in self._name else example.qid
    features = self._task.featurize(example, False, for_eval=True)

    prelim_predictions = []
    # keep track of the minimum score of null start+end of position 0
    score_null = 1000000  # large and positive
    for (feature_index, feature) in enumerate(features):
      result = unique_id_to_result[feature[self._name + "_eid"]]
      if self._config.joint_prediction:
        start_indexes = result.start_top_index
        end_indexes = result.end_top_index
      else:
        start_indexes = _get_best_indexes(result.start_logits,
                                          self._config.n_best_size)
        end_indexes = _get_best_indexes(result.end_logits,
                                        self._config.n_best_size)
      # if we could have irrelevant answers, get the min score of irrelevant
      if self._v2:
        if self._config.answerable_classifier:
          feature_null_score = result.answerable_logit
        else:
          feature_null_score = result.start_logits[0] + result.end_logits[0]
        if feature_null_score < score_null:
          score_null = feature_null_score
      for i, start_index in enumerate(start_indexes):
        for j, end_index in enumerate(
            end_indexes[i] if self._config.joint_prediction else end_indexes):
          # We could hypothetically create invalid predictions, e.g., predict
          # that the start of the span is in the question. We throw out all
          # invalid predictions.
          if start_index >= len(feature[self._name + "_tokens"]):
            continue
          if end_index >= len(feature[self._name + "_tokens"]):
            continue
          if start_index == 0:
            continue
          if start_index not in feature[self._name + "_token_to_orig_map"]:
            continue
          if end_index not in feature[self._name + "_token_to_orig_map"]:
            continue
          if not feature[self._name + "_token_is_max_context"].get(
              start_index, False):
            continue
          if end_index < start_index:
            continue
          length = end_index - start_index + 1
          if length > self._config.max_answer_length:
            continue
          start_logit = (result.start_top_log_probs[i]
                         if self._config.joint_prediction
                         else result.start_logits[start_index])
          end_logit = (result.end_top_log_probs[i, j]
                       if self._config.joint_prediction
                       else result.end_logits[end_index])
          prelim_predictions.append(
              _PrelimPrediction(feature_index=feature_index,
                                start_index=start_index,
                                end_index=end_index,
                                start_logit=start_logit,
                                end_logit=end_logit))

    if self._v2:
      if len(prelim_predictions) == 0 and self._config.debug:
        tokid = sorted(feature[self._name + "_token_to_orig_map"].keys())[0]
        prelim_predictions.append(
            _PrelimPrediction(feature_index=0,
                              start_index=tokid,
                              end_index=tokid + 1,
                              start_logit=1.0,
                              end_logit=1.0))
    prelim_predictions = sorted(prelim_predictions,
                                key=lambda x: (x.start_logit + x.end_logit),
                                reverse=True)

    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_logit", "end_logit"])

    seen_predictions = {}
    nbest = []
    for pred in prelim_predictions:
      if len(nbest) >= self._config.n_best_size:
        break
      feature = features[pred.feature_index]
      tok_tokens = feature[self._name + "_tokens"][
          pred.start_index:(pred.end_index + 1)]
      orig_doc_start = feature[
          self._name + "_token_to_orig_map"][pred.start_index]
      orig_doc_end = feature[self._name + "_token_to_orig_map"][pred.end_index]
      orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
      tok_text = " ".join(tok_tokens)

      # De-tokenize WordPieces that have been split off.
      tok_text = tok_text.replace(" ##", "")
      tok_text = tok_text.replace("##", "")

      # Clean whitespace
      tok_text = tok_text.strip()
      tok_text = " ".join(tok_text.split())
      orig_text = " ".join(orig_tokens)

      final_text = get_final_text(self._config, tok_text, orig_text)
      if final_text in seen_predictions:
        continue
      seen_predictions[final_text] = True

      nbest.append(
          _NbestPrediction(text=final_text,
                           start_logit=pred.start_logit,
                           end_logit=pred.end_logit))

    # In very rare edge cases we could have no valid predictions. So we
    # just create a nonce prediction in this case to avoid failure.
    if not nbest:
      nbest.append(
          _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

    assert len(nbest) >= 1

    total_scores = []
    best_non_null_entry = None
    for entry in nbest:
      total_scores.append(entry.start_logit + entry.end_logit)
      if not best_non_null_entry:
        if entry.text:
          best_non_null_entry = entry

    probs = _compute_softmax(total_scores)

    nbest_json = []
    for (i, entry) in enumerate(nbest):
      output = collections.OrderedDict()
      output["text"] = entry.text
      output["probability"] = probs[i]
      output["start_logit"] = entry.start_logit
      output["end_logit"] = entry.end_logit
      nbest_json.append(dict(output))

    assert len(nbest_json) >= 1

    if not self._v2:
      all_predictions[example_id] = nbest_json[0]["text"]
    else:
      # predict "" iff the null score - the score of best non-null > threshold
      if self._config.answerable_classifier:
        score_diff = score_null
      else:
        score_diff = score_null - best_non_null_entry.start_logit - (
            best_non_null_entry.end_logit)
      scores_diff_json[example_id] = score_diff
      all_predictions[example_id] = best_non_null_entry.text

    all_nbest_json[example_id] = nbest_json

  utils.write_json(dict(all_predictions),
                   self._config.qa_preds_file(self._name))
  utils.write_pickle(all_nbest_json, self._config.eval_all_nbest_file)
  if self._v2:
    utils.write_json(
        {k: float(v) for k, v in six.iteritems(scores_diff_json)},
        self._config.qa_na_file(self._name))

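# _get_best_indexes and _compute_softmax are referenced above but not defined
# in this excerpt. The sketches below assume the standard BERT/ELECTRA SQuAD
# post-processing implementations; treat them as illustrative, not as the exact
# code used in this repository.
import math


def _get_best_indexes(logits, n_best_size):
  """Return the indexes of the n-best logits from a list, best first."""
  index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
  return [index for index, _ in index_and_score[:n_best_size]]


def _compute_softmax(scores):
  """Compute softmax probabilities over raw logits, guarding against overflow."""
  if not scores:
    return []
  max_score = max(scores)
  exp_scores = [math.exp(s - max_score) for s in scores]
  total = sum(exp_scores)
  return [s / total for s in exp_scores]
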