Example #1
    def reduce_aggregated_logs(self, aggregated_logs):
        # Convert the aggregated model outputs into text predictions.
        all_predictions, _, scores_diff = (self.squad_lib.postprocess_output(
            self._eval_examples,
            self._eval_features,
            aggregated_logs,
            self.task_config.n_best_size,
            self.task_config.max_answer_length,
            self.task_config.validation_data.do_lower_case,
            version_2_with_negative=(
                self.task_config.validation_data.version_2_with_negative),
            null_score_diff_threshold=(
                self.task_config.null_score_diff_threshold),
            verbose=False))

        # Load the ground-truth examples from the validation JSON file.
        with tf.io.gfile.GFile(self.task_config.validation_data.input_path,
                               'r') as reader:
            dataset_json = json.load(reader)
            pred_dataset = dataset_json['data']
        if self.task_config.validation_data.version_2_with_negative:
            eval_metrics = squad_evaluate_v2_0.evaluate(
                pred_dataset, all_predictions, scores_diff)
        else:
            eval_metrics = squad_evaluate_v1_1.evaluate(
                pred_dataset, all_predictions)
        return eval_metrics
Example #2
def eval_squad(strategy,
               input_meta_data,
               tokenizer,
               bert_config,
               squad_lib,
               init_checkpoint=None):
    """Get prediction results and evaluate them against ground truth."""
    if init_checkpoint is None:
        init_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)

    all_predict_files = _get_matched_files(FLAGS.predict_file)
    if len(all_predict_files) != 1:
        raise ValueError('`eval_squad` only supports one predict file, '
                         'but got %s' % all_predict_files)

    squad_model = get_squad_model_to_predict(strategy, bert_config,
                                             init_checkpoint, input_meta_data)
    all_predictions, all_nbest_json, scores_diff_json = prediction_output_squad(
        strategy, input_meta_data, tokenizer, squad_lib, all_predict_files[0],
        squad_model)
    dump_to_files(all_predictions, all_nbest_json, scores_diff_json, squad_lib,
                  input_meta_data.get('version_2_with_negative', False))

    # Read the ground-truth dataset that predictions are scored against.
    with tf.io.gfile.GFile(FLAGS.predict_file, 'r') as reader:
        dataset_json = json.load(reader)
        pred_dataset = dataset_json['data']
    if input_meta_data.get('version_2_with_negative', False):
        eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset,
                                                    all_predictions,
                                                    scores_diff_json)
    else:
        eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset,
                                                    all_predictions)
    return eval_metrics
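
The helper `_get_matched_files` is not shown in this snippet. A minimal sketch of what it could look like, assuming comma-separated glob patterns resolved with `tf.io.gfile.glob` (the name and behavior are assumptions, not the original implementation):

import tensorflow as tf

def _get_matched_files(input_path):
    """Hypothetical helper: expand a comma-separated list of file patterns."""
    all_matched_files = []
    for pattern in input_path.split(','):
        # tf.io.gfile.glob returns every path matching the given pattern.
        all_matched_files.extend(tf.io.gfile.glob(pattern))
    return sorted(all_matched_files)
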
Example #3
def eval_squad(strategy,
               input_meta_data,
               tokenizer,
               bert_config,
               squad_lib,
               init_checkpoint=None):
    """Get prediction results and evaluate them against ground truth."""
    if init_checkpoint is None:
        init_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)
    all_predictions, all_nbest_json, scores_diff_json = prediction_output_squad(
        strategy, input_meta_data, tokenizer, bert_config, squad_lib,
        init_checkpoint)
    dump_to_files(all_predictions, all_nbest_json, scores_diff_json, squad_lib,
                  input_meta_data.get('version_2_with_negative', False))

    with tf.io.gfile.GFile(FLAGS.predict_file, 'r') as reader:
        dataset_json = json.load(reader)
        pred_dataset = dataset_json['data']
    if input_meta_data.get('version_2_with_negative', False):
        eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset,
                                                    all_predictions,
                                                    scores_diff_json)
    else:
        eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset,
                                                    all_predictions)
    return eval_metrics
Example #4
    def evaluate(self, model, input_fn, num_steps, eval_examples,
                 eval_features, predict_file, version_2_with_negative,
                 max_answer_length, null_score_diff_threshold, verbose_logging,
                 output_dir):
        """Evaluate QA model.

    Args:
      model: The model to be evaluated.
      input_fn: Function that returns a tf.data.Dataset used for evaluation.
      num_steps: Number of steps to evaluate the model.
      eval_examples: List of `squad_lib.SquadExample` for evaluation data.
      eval_features: List of `squad_lib.InputFeatures` for evaluation data.
      predict_file: The input predict file.
      version_2_with_negative: Whether the input predict file is SQuAD 2.0
        format.
      max_answer_length: The maximum length of an answer that can be generated.
        This is needed because the start and end predictions are not conditioned
        on one another.
      null_score_diff_threshold: If null_score - best_non_null is greater than
        the threshold, predict null. This is only used for SQuAD v2.
      verbose_logging: If true, all of the warnings related to data processing
        will be printed. A number of warnings are expected for a normal SQuAD
        evaluation.
      output_dir: The output directory to save output to json files:
        predictions.json, nbest_predictions.json, null_odds.json. If None, skip
        saving to json files.

    Returns:
      A dict contains two metrics: Exact match rate and F1 score.
    """
        all_results = self.predict(model, input_fn, num_steps)

        all_predictions, all_nbest_json, scores_diff_json = (
            squad_lib.postprocess_output(
                eval_examples,
                eval_features,
                all_results,
                n_best_size=20,
                max_answer_length=max_answer_length,
                do_lower_case=self.do_lower_case,
                version_2_with_negative=version_2_with_negative,
                null_score_diff_threshold=null_score_diff_threshold,
                verbose=verbose_logging))

        if output_dir is not None:
            dump_to_files(all_predictions, all_nbest_json, scores_diff_json,
                          version_2_with_negative, output_dir)

        dataset_json = file_util.load_json_file(predict_file)
        pred_dataset = dataset_json['data']

        if version_2_with_negative:
            eval_metrics = squad_evaluate_v2_0.evaluate(
                pred_dataset, all_predictions, scores_diff_json)
        else:
            eval_metrics = squad_evaluate_v1_1.evaluate(
                pred_dataset, all_predictions)
        return eval_metrics
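
Based on the docstring above, a call to `evaluate` might look like the following sketch. The instance name `qa_task`, the model, the input_fn, and the file paths are placeholders, not values from the original code:

metrics = qa_task.evaluate(
    model=qa_model,
    input_fn=validation_input_fn,
    num_steps=num_eval_steps,
    eval_examples=eval_examples,
    eval_features=eval_features,
    predict_file='dev-v1.1.json',
    version_2_with_negative=False,
    max_answer_length=30,
    null_score_diff_threshold=0.0,
    verbose_logging=False,
    output_dir=None)  # None skips writing the prediction json files.
print(metrics)  # A dict with exact match and F1 entries.
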
Example #5
def eval_squad(strategy, input_meta_data, tokenizer, bert_config, squad_lib):
    """Get prediction results and evaluate them against ground truth."""
    all_predictions, all_nbest_json, scores_diff_json = prediction_output_squad(
        strategy, input_meta_data, tokenizer, bert_config, squad_lib)
    dump_to_files(all_predictions, all_nbest_json, scores_diff_json, squad_lib,
                  input_meta_data.get('version_2_with_negative', False))

    if input_meta_data.get('version_2_with_negative', False):
        # TODO(lehou): support in memory evaluation for SQuAD v2.
        logging.error('SQuAD v2 eval is not supported. Skipping eval')
        return None
    else:
        with tf.io.gfile.GFile(FLAGS.predict_file, 'r') as reader:
            dataset_json = json.load(reader)
            pred_dataset = dataset_json['data']
        eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset,
                                                    all_predictions)
        return eval_metrics
Example #6
    def reduce_aggregated_logs(self, aggregated_logs):
        all_predictions, _, scores_diff = (self.squad_lib.postprocess_output(
            self._eval_examples,
            self._eval_features,
            aggregated_logs,
            self.task_config.n_best_size,
            self.task_config.max_answer_length,
            self.task_config.validation_data.do_lower_case,
            version_2_with_negative=(
                self.task_config.validation_data.version_2_with_negative),
            null_score_diff_threshold=(
                self.task_config.null_score_diff_threshold),
            xlnet_format=self.task_config.validation_data.xlnet_format,
            verbose=False))

        with tf.io.gfile.GFile(self.task_config.validation_data.input_path,
                               'r') as reader:
            dataset_json = json.load(reader)
            pred_dataset = dataset_json['data']
        if self.task_config.validation_data.version_2_with_negative:
            eval_metrics = squad_evaluate_v2_0.evaluate(
                pred_dataset, all_predictions, scores_diff)
            # Filter out useless metrics, such as start_position_accuracy that
            # we did not actually compute.
            eval_metrics = {
                'exact_match': eval_metrics['final_exact'],
                'exact_match_threshold': eval_metrics['final_exact_thresh'],
                'final_f1': eval_metrics['final_f1'] / 100.0,  # Scale back to [0, 1].
                'f1_threshold': eval_metrics['final_f1_thresh'],
                'has_answer_exact_match': eval_metrics['HasAns_exact'],
                'has_answer_f1': eval_metrics['HasAns_f1']
            }
        else:
            eval_metrics = squad_evaluate_v1_1.evaluate(
                pred_dataset, all_predictions)
            # Filter out useless metrics, such as start_position_accuracy that
            # we did not actually compute.
            eval_metrics = {
                'exact_match': eval_metrics['exact_match'],
                'final_f1': eval_metrics['final_f1']
            }
        return eval_metrics
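
A sketch of how the filtered metrics dict returned above might be consumed; `task`, `aggregated_logs`, `global_step`, and the log directory are placeholders, and writing scalars with `tf.summary` is just one possible way to report them:

import tensorflow as tf

metrics = task.reduce_aggregated_logs(aggregated_logs)
with tf.summary.create_file_writer('/tmp/squad_eval_logs').as_default():
    for name, value in metrics.items():
        # Each entry, e.g. 'exact_match' or 'final_f1', is logged as a scalar.
        tf.summary.scalar(name, value, step=global_step)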