def reduce_aggregated_logs(self, aggregated_logs):
  # Convert the aggregated raw model outputs into text predictions and the
  # null-score differences.
  all_predictions, _, scores_diff = (self.squad_lib.postprocess_output(
      self._eval_examples,
      self._eval_features,
      aggregated_logs,
      self.task_config.n_best_size,
      self.task_config.max_answer_length,
      self.task_config.validation_data.do_lower_case,
      version_2_with_negative=(
          self.task_config.validation_data.version_2_with_negative),
      null_score_diff_threshold=(
          self.task_config.null_score_diff_threshold),
      verbose=False))

  # Score the predictions against the ground-truth SQuAD file.
  with tf.io.gfile.GFile(self.task_config.validation_data.input_path,
                         'r') as reader:
    dataset_json = json.load(reader)
    pred_dataset = dataset_json['data']
  if self.task_config.validation_data.version_2_with_negative:
    eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset, all_predictions,
                                                scores_diff)
  else:
    eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
  return eval_metrics
def eval_squad(strategy, input_meta_data, tokenizer, bert_config, squad_lib,
               init_checkpoint=None):
  """Get prediction results and evaluate them against ground truth."""
  if init_checkpoint is None:
    init_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)

  all_predict_files = _get_matched_files(FLAGS.predict_file)
  if len(all_predict_files) != 1:
    raise ValueError('`eval_squad` only supports one predict file, '
                     'but got %s' % all_predict_files)

  squad_model = get_squad_model_to_predict(strategy, bert_config,
                                           init_checkpoint, input_meta_data)
  all_predictions, all_nbest_json, scores_diff_json = prediction_output_squad(
      strategy, input_meta_data, tokenizer, squad_lib, all_predict_files[0],
      squad_model)
  dump_to_files(all_predictions, all_nbest_json, scores_diff_json, squad_lib,
                input_meta_data.get('version_2_with_negative', False))

  with tf.io.gfile.GFile(FLAGS.predict_file, 'r') as reader:
    dataset_json = json.load(reader)
    pred_dataset = dataset_json['data']
  if input_meta_data.get('version_2_with_negative', False):
    eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset, all_predictions,
                                                scores_diff_json)
  else:
    eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
  return eval_metrics
def eval_squad(strategy, input_meta_data, tokenizer, bert_config, squad_lib,
               init_checkpoint=None):
  """Get prediction results and evaluate them against ground truth."""
  if init_checkpoint is None:
    init_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)

  all_predictions, all_nbest_json, scores_diff_json = prediction_output_squad(
      strategy, input_meta_data, tokenizer, bert_config, squad_lib,
      init_checkpoint)
  dump_to_files(all_predictions, all_nbest_json, scores_diff_json, squad_lib,
                input_meta_data.get('version_2_with_negative', False))

  with tf.io.gfile.GFile(FLAGS.predict_file, 'r') as reader:
    dataset_json = json.load(reader)
    pred_dataset = dataset_json['data']
  if input_meta_data.get('version_2_with_negative', False):
    eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset, all_predictions,
                                                scores_diff_json)
  else:
    eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
  return eval_metrics
def evaluate(self, model, input_fn, num_steps, eval_examples, eval_features,
             predict_file, version_2_with_negative, max_answer_length,
             null_score_diff_threshold, verbose_logging, output_dir):
  """Evaluate QA model.

  Args:
    model: The model to be evaluated.
    input_fn: Function that returns a tf.data.Dataset used for evaluation.
    num_steps: Number of steps to evaluate the model.
    eval_examples: List of `squad_lib.SquadExample` for evaluation data.
    eval_features: List of `squad_lib.InputFeatures` for evaluation data.
    predict_file: The input predict file.
    version_2_with_negative: Whether the input predict file is in SQuAD 2.0
      format.
    max_answer_length: The maximum length of an answer that can be generated.
      This is needed because the start and end predictions are not conditioned
      on one another.
    null_score_diff_threshold: If null_score - best_non_null is greater than
      the threshold, predict null. This is only used for SQuAD v2.
    verbose_logging: If true, all of the warnings related to data processing
      will be printed. A number of warnings are expected for a normal SQuAD
      evaluation.
    output_dir: The output directory to save the json files to:
      predictions.json, nbest_predictions.json, null_odds.json. If None, skip
      saving to json files.

  Returns:
    A dict containing two metrics: exact match rate and F1 score.
  """
  all_results = self.predict(model, input_fn, num_steps)

  all_predictions, all_nbest_json, scores_diff_json = (
      squad_lib.postprocess_output(
          eval_examples,
          eval_features,
          all_results,
          n_best_size=20,
          max_answer_length=max_answer_length,
          do_lower_case=self.do_lower_case,
          version_2_with_negative=version_2_with_negative,
          null_score_diff_threshold=null_score_diff_threshold,
          verbose=verbose_logging))

  if output_dir is not None:
    dump_to_files(all_predictions, all_nbest_json, scores_diff_json,
                  version_2_with_negative, output_dir)

  dataset_json = file_util.load_json_file(predict_file)
  pred_dataset = dataset_json['data']

  if version_2_with_negative:
    eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset, all_predictions,
                                                scores_diff_json)
  else:
    eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
  return eval_metrics
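# A minimal, self-contained sketch of the final scoring step that `evaluate`
# above (and the other snippets in this section) delegates to. Everything
# below is illustrative and not part of the original code: the import path
# for `squad_evaluate_v1_1`, the tiny in-memory dataset, and the prediction
# dict are assumptions, and the exact metric key names can differ between
# library versions.
from official.nlp.tools import squad_evaluate_v1_1  # import path may vary

# `pred_dataset` mimics the `dataset_json['data']` list that the functions
# above read from `predict_file`: articles -> paragraphs -> qas entries.
pred_dataset = [{
    'title': 'Example',
    'paragraphs': [{
        'context': 'TensorFlow was open-sourced in 2015.',
        'qas': [{
            'id': 'q1',
            'question': 'When was TensorFlow open-sourced?',
            'answers': [{'text': '2015', 'answer_start': 31}],
        }],
    }],
}]

# `all_predictions` maps question ids to predicted answer strings, which is
# what `squad_lib.postprocess_output` produces from the raw model outputs.
all_predictions = {'q1': '2015'}

eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
print(eval_metrics)  # Exact match and F1, both 100.0 for this toy example.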
def reduce_aggregated_logs(self, aggregated_logs):
  all_predictions, _, scores_diff = (self.squad_lib.postprocess_output(
      self._eval_examples,
      self._eval_features,
      aggregated_logs,
      self.task_config.n_best_size,
      self.task_config.max_answer_length,
      self.task_config.validation_data.do_lower_case,
      version_2_with_negative=(
          self.task_config.validation_data.version_2_with_negative),
      null_score_diff_threshold=(
          self.task_config.null_score_diff_threshold),
      xlnet_format=self.task_config.validation_data.xlnet_format,
      verbose=False))

  with tf.io.gfile.GFile(self.task_config.validation_data.input_path,
                         'r') as reader:
    dataset_json = json.load(reader)
    pred_dataset = dataset_json['data']
  if self.task_config.validation_data.version_2_with_negative:
    eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset, all_predictions,
                                                scores_diff)
    # Filter out useless metrics, such as start_position_accuracy that
    # we did not actually compute.
    eval_metrics = {
        'exact_match': eval_metrics['final_exact'],
        'exact_match_threshold': eval_metrics['final_exact_thresh'],
        'final_f1': eval_metrics['final_f1'] / 100.0,  # scale back to [0, 1].
        'f1_threshold': eval_metrics['final_f1_thresh'],
        'has_answer_exact_match': eval_metrics['HasAns_exact'],
        'has_answer_f1': eval_metrics['HasAns_f1']
    }
  else:
    eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions)
    # Filter out useless metrics, such as start_position_accuracy that
    # we did not actually compute.
    eval_metrics = {
        'exact_match': eval_metrics['exact_match'],
        'final_f1': eval_metrics['final_f1']
    }
  return eval_metrics