def _convert_single_wtq(interaction_file, prediction_file, output_file):
    """Convert predictions to WikiTablequestions format."""

    interactions = dict(
        (prediction_utils.parse_interaction_id(i.id), i)
        for i in prediction_utils.iterate_interactions(interaction_file))
    missing_interaction_ids = set(interactions.keys())

    with tf.io.gfile.GFile(output_file, 'w') as output_file:
        for prediction in prediction_utils.iterate_predictions(
                prediction_file):
            interaction_id = prediction['id']
            if interaction_id in missing_interaction_ids:
                missing_interaction_ids.remove(interaction_id)
            else:
                # Skip duplicates and predictions without a matching interaction.
                continue

            coordinates = prediction_utils.parse_coordinates(
                prediction['answer_coordinates'])

            # Executes the predicted aggregation over the selected coordinates
            # to obtain the predicted denotation.
            denot_pred, _ = calc_metrics_utils.execute(
                int(prediction.get('pred_aggr', '0')), coordinates,
                prediction_utils.table_to_panda_frame(
                    interactions[interaction_id].table))

            answers = '\t'.join(sorted(map(str, denot_pred)))
            output_file.write('{}\t{}\n'.format(interaction_id, answers))

        for interaction_id in missing_interaction_ids:
            # Interactions without a prediction get an empty answer line.
            output_file.write('{}\n'.format(interaction_id))
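
A minimal usage sketch; the three paths below are placeholders for an interactions TF record, the matching prediction TSV, and the desired output file:

# Hypothetical file paths; substitute the files from your own run.
_convert_single_wtq(
    interaction_file='wtq_test_interactions.tfrecord',
    prediction_file='wtq_test_predictions.tsv',
    output_file='wtq_test_answers.tsv',
)
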
def read_predictions(predictions_path, examples):
  """Reads predictions from a csv file."""
  for row in prediction_utils.iterate_predictions(predictions_path):
    pred_id = '{}-{}_{}'.format(row['id'], row['annotator'], row['position'])
    example = examples[pred_id]
    example.pred_cell_coo = prediction_utils.parse_coordinates(
        row['answer_coordinates'])
    example.pred_agg_function = int(row.get('pred_aggr', '0'))
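
The lookup key combines the interaction id, annotator and position; a tiny worked example with hypothetical row values:

# Hypothetical row values, only to illustrate the key format.
row = {'id': 'nt-1', 'annotator': '0', 'position': '0'}
pred_id = '{}-{}_{}'.format(row['id'], row['annotator'], row['position'])
assert pred_id == 'nt-1-0_0'
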
def generate_hybridqa_codalab_predictions(interaction_file, prediction_file):
    """Generates Codaab prediction files for HybridQA Competition.

  This function generates the json prediction files used to submit to HybridQA
  competition hosted on Codalab. (go/hybridqa-competition)

  Args:
    interaction_file: A TF record file containing the examples as interactions.
    prediction_file: A TSV file that is the output of the table-classifier
      predict job on the input interactions.

  Yields:
    An iterable of json serializable python dicts.
  """
    vocab_file = _guess_vocab_file(interaction_file)
    logging.info("Vocab file: %s ", vocab_file)
    logging.info("Read: %s ", interaction_file)
    interactions = prediction_utils.iterate_interactions(interaction_file)
    logging.info("Read: %s ", prediction_file)
    predictions = prediction_utils.iterate_predictions(prediction_file)

    detokenizer = DeTokenizer(vocab_file)

    interactions_by_qid = collections.defaultdict(list)
    for interaction in interactions:
        qid = interaction.questions[0].id
        interactions_by_qid[_get_example_id(qid)].append(interaction)

    predictions_by_qid = {}
    for prediction in predictions:
        qid = prediction["question_id"]
        # TODO(eisenjulian): Select the best answer using model scores.
        predictions_by_qid[qid] = prediction

    for qid, candidates in interactions_by_qid.items():
        answer_text = ""
        results = list(
            _get_scored_candidates(
                detokenizer,
                candidates,
                predictions_by_qid,
            ))
        example_id = text_utils.get_example_id(qid)
        if results:
            best_result = max(results, key=lambda result: result.score)
            answer_text = best_result.answer

        yield {"question_id": example_id, "pred": answer_text}
def evaluate_retrieval_e2e(
    interaction_file,
    prediction_file,
    references_file=None,
    vocab_file=None,
):
  """Computes e2e retrieval-QA metrics."""
  vocab_file = vocab_file or _guess_vocab_file(interaction_file)
  references = None
  logging.info("Vocab file: %s ", vocab_file)
  logging.info("Read: %s ", interaction_file)
  interactions = prediction_utils.iterate_interactions(interaction_file)
  logging.info("Read: %s ", prediction_file)
  predictions = prediction_utils.iterate_predictions(prediction_file)
  return _evaluate_retrieval_e2e(vocab_file, interactions, predictions,
                                 references)
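
A usage sketch with placeholder paths; pass vocab_file explicitly when it cannot be guessed from the interaction file:

# Hypothetical paths for the retrieval interactions and predictions.
metrics = evaluate_retrieval_e2e(
    interaction_file='retrieval_interactions.tfrecord',
    prediction_file='retrieval_predictions.tsv',
)
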
def get_predictions(prediction_file):
    """Yields an iterable of Prediction objects from a tsv prediction file."""
    fn_map = {
        'logits_cls': float,
        'position': int,
        'answer_coordinates':
        lambda x: list(prediction_utils.parse_coordinates(x)),
        'answers': token_answers_from_text,
        'token_probabilities': json.loads,
    }
    for prediction_dict in prediction_utils.iterate_predictions(
            prediction_file):
        for key in tuple(prediction_dict.keys()):
            # Keys without a registered converter are kept as raw strings.
            fn = fn_map.get(key, lambda x: x)
            prediction_dict[key] = fn(prediction_dict[key])
        yield Prediction(**prediction_dict)
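
A sketch of consuming the parsed Prediction objects, assuming the prediction TSV carries the columns named in fn_map:

# Hypothetical prediction file path.
for prediction in get_predictions('predictions.tsv'):
    # answer_coordinates has been parsed into a list of (row, column) tuples.
    print(prediction.position, prediction.answer_coordinates)
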
def test_iterate_predictions(self):
    filepath = tempfile.mktemp(suffix='.tsv')
    predictions = [
        {
            'logits_cls': 0.1
        },
        {
            'logits_cls': [3.0, 4.0]
        },
    ]
    with tf.io.gfile.GFile(filepath, mode='w') as writer:
        writer.write('logits_cls\n')
        writer.write('0.1\n')
        writer.write('[3 4]\n')
    actual_predictions = list(
        prediction_utils.iterate_predictions(filepath))
    self.assertEqual(predictions, actual_predictions)
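
The test implies the on-disk format is a header row followed by one value per column per line; a small sketch of writing and reading a two-column TSV (the column names are only illustrative):

import tensorflow as tf  # assumed to be imported as in the snippets above

# Illustrative two-column TSV; iterate_predictions yields one dict per row.
with tf.io.gfile.GFile('toy_predictions.tsv', 'w') as writer:
    writer.write('question_id\tlogits_cls\n')
    writer.write('q-1\t0.1\n')
rows = list(prediction_utils.iterate_predictions('toy_predictions.tsv'))
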
def read_predictions(predictions_path, examples):
    """Reads predictions from a csv file."""
    for row in prediction_utils.iterate_predictions(predictions_path):
        pred_id = '{}-{}_{}'.format(row['id'], row['annotator'],
                                    row['position'])
        example = examples[pred_id]
        example.pred_cell_coo = prediction_utils.parse_coordinates(
            row['answer_coordinates'])
        example.pred_agg_function = int(row.get('pred_aggr', '0'))
        if 'column_scores' in row:
            # column_scores is serialized as a bracketed, space-separated list.
            column_scores = list(
                filter(None, row['column_scores'][1:-1].split(' ')))
            removed_column_scores = [
                float(score) for score in column_scores if float(score) < 0.0
            ]
            if column_scores:
                # Weight is the fraction of columns with a negative score.
                example.weight = len(removed_column_scores) / len(
                    column_scores)
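
A tiny worked example of the weight computed above, using a hypothetical column_scores string:

# Hypothetical column_scores cell as it might appear in the prediction TSV.
raw = '[0.5 -1.2 -0.3 2.0]'
scores = list(filter(None, raw[1:-1].split(' ')))
removed = [float(s) for s in scores if float(s) < 0.0]
assert len(removed) / len(scores) == 0.5  # two of the four scores are negative
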
def create_outputs(
    input_dir,
    prediction_files,
    output_dir,
):
    """Iterates over XML files in input dir and adds types to statements."""
    prediction_dict = collections.defaultdict(list)
    for prediction_file in prediction_files:
        for row in prediction_utils.iterate_predictions(prediction_file):
            prediction_dict[row["question_id"]].append(row["logits_cls"])

    for filename in tf.io.gfile.listdir(input_dir):
        if not filename.endswith("xml"):
            continue

        filepath = os.path.join(input_dir, filename)
        doc_id = _get_doc_id(filepath)

        with tf.io.gfile.GFile(filepath) as input_file:
            soup = bs4.BeautifulSoup(input_file.read(), "lxml")
            for table in soup.find_all("table"):
                table_id = _get_table_id(doc_id, table["id"])
                for statement in table.find_all("statement"):
                    interaction_id = _get_interaction_id(
                        table_id,
                        int(statement["id"]),
                    )
                    question_id = _get_question_id(interaction_id)
                    prediction_list = prediction_dict[question_id]
                    index = _get_majority_label(prediction_list)
                    statement["type"] = _index_to_type(index)

        tf.io.gfile.makedirs(output_dir)
        output_path = os.path.join(output_dir, filename)
        with tf.io.gfile.GFile(output_path, "w") as output_file:
            output_file.write(str(soup))
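
A usage sketch with placeholder paths; helpers such as _get_doc_id and _get_majority_label are assumed to be defined elsewhere in the module, and predictions from several files (for example, different seeds) are pooled per question before the majority vote:

# Hypothetical directories and prediction files.
create_outputs(
    input_dir='xml_inputs',
    prediction_files=['predictions_seed_0.tsv', 'predictions_seed_1.tsv'],
    output_dir='xml_outputs',
)
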
def eval_cell_selection(
    questions,
    predictions_file,
):
    """Evaluates cell selection results in HybridQA experiment.

  Args:
    questions: A map of Question protos by their respective ids.
    predictions_file: Path to a tsv file with predictions for a checkpoint.

  Yields:
    An AnswerType and its corresponding CellSelectionMetrics instance
  """
    total = collections.Counter()
    total_correct = collections.Counter()
    total_correct_at_k = {k: collections.Counter() for k in _RECALL_KS}
    total_seen = collections.Counter()
    total_non_empty = collections.Counter()
    total_coordinates = collections.Counter()
    sum_precision = collections.defaultdict(float)

    for question in questions.values():
        for answer_type in [AnswerType.ALL, _get_answer_type(question)]:
            total[answer_type] += 1

    # Guards against an empty predictions file; updated per row below.
    has_probabilities = False
    for row in prediction_utils.iterate_predictions(predictions_file):
        question = questions.get(row['question_id'])
        if question is None:
            # The dataset lost some examples after an update.
            continue
        gold_coordinates = {(x.row_index, x.column_index)
                            for x in question.answer.answer_coordinates}
        coordinates = prediction_utils.parse_coordinates(
            row['answer_coordinates'])
        # We only care about finding one correct cell for the downstream model.
        correct_coordinates = len(coordinates & gold_coordinates)
        has_probabilities = 'token_probabilities' in row
        if has_probabilities:
            best_cells = get_best_cells(json.loads(row['token_probabilities']))
            correct_at_k = {
                k: bool(set(best_cells[:k]) & gold_coordinates)
                for k in _RECALL_KS
            }
        else:
            correct_at_k = {}
        for answer_type in [AnswerType.ALL, _get_answer_type(question)]:
            total_coordinates[answer_type] += len(coordinates)
            total_correct[answer_type] += bool(correct_coordinates)
            total_seen[answer_type] += 1
            for k, correct in correct_at_k.items():
                total_correct_at_k[k][answer_type] += correct
            if coordinates:
                sum_precision[answer_type] += correct_coordinates / len(
                    coordinates)
                total_non_empty[answer_type] += 1

    for answer_type in AnswerType:
        if total[answer_type]:
            recall_at_k = {
                f'recall_at_{k}':
                (total_correct_at_k[k][answer_type] /
                 total[answer_type]) if has_probabilities else None
                for k in _RECALL_KS
            }
            yield answer_type, CellSelectionMetrics(
                recall=total_correct[answer_type] / total[answer_type],
                non_empty=total_non_empty[answer_type] / total[answer_type],
                coverage=total_seen[answer_type] / total[answer_type],
                answer_len=total_coordinates[answer_type] / total[answer_type],
                precision=((sum_precision[answer_type] /
                            total_non_empty[answer_type])
                           if total_non_empty[answer_type] else None),
                **recall_at_k,
            )
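
A sketch of iterating the generator, assuming a questions mapping from question ids to Question protos has been built elsewhere:

# Hypothetical predictions path; `questions` is constructed elsewhere.
for answer_type, metrics in eval_cell_selection(
        questions, 'hybridqa_predictions.tsv'):
    print(answer_type, metrics.recall, metrics.precision, metrics.coverage)
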