Beispiel #1
0
 def test_min_rank(self):
     """Each title token retrieves exactly its own table when min_rank=1."""
     tables = [
         interaction_pb2.Table(table_id="table_0",
                               document_title="Table A"),
         interaction_pb2.Table(table_id="table_1", document_title="Table B"),
     ]
     index = tfidf_baseline_utils.create_inverted_index(tables, min_rank=1)
     self.assertEqual(index.retrieve("A"), [("table_0", 1.0)])
     self.assertEqual(index.retrieve("B"), [("table_1", 1.0)])
Beispiel #2
0
 def test_simple(self, drop_term_frequency, expected):
     """Checks inverted-index retrieval results on a tiny two-table corpus."""
     tables = [
         interaction_pb2.Table(table_id="table_0",
                               document_title="a a c"),
         interaction_pb2.Table(table_id="table_1", document_title="b c"),
     ]
     index = tfidf_baseline_utils.create_inverted_index(
         tables, drop_term_frequency=drop_term_frequency)
     for query, results in expected:
         self.assertEqual(index.retrieve(query), results)
def _get_table(table_id):
  """Builds a fixed 3-column, 3-row example Table proto with the given id."""

  def _make_row(texts):
    # One Cells proto per row of cell texts.
    return interaction_pb2.Cells(
        cells=[interaction_pb2.Cell(text=text) for text in texts])

  column_names = ("Position", "Player", "Team")
  row_values = (
      ("1", "player 1", "team 1"),
      ("2", "player 2", "team 2"),
      ("1", "player 3", "team 2"),
  )
  return interaction_pb2.Table(
      columns=[interaction_pb2.Cell(text=name) for name in column_names],
      rows=[_make_row(values) for values in row_values],
      table_id=table_id,
  )
 def test_single_cell(self, cell, text, exepected=None):
   """Round-trips `text` through tokenize/detokenize against a one-cell table.

   NOTE(review): the parameter is spelled `exepected` (sic); renaming it
   could break callers passing it by keyword, so it is kept as-is. When it
   is None, detokenization is expected to reproduce `text` exactly.
   """
   with tempfile.TemporaryDirectory() as temp_dir:
     vocab_file = os.path.join(temp_dir, "vocab.txt")
     self._get_vocab_file(
         vocab_file,
         [
             "a",
             "b",
             "bb",
             "##b",
             "3",
             ".",
             "5",
             "insti",
             "##tuto",
             "reacao",
             "##d",
         ],
     )
     detokenizer = e2e_eval_utils.DeTokenizer(vocab_file)
     tokenizer = tokenization.FullTokenizer(
         vocab_file,
         do_lower_case=True,
         split_on_punc=True,
     )
   # NOTE(review): tokenizer/detokenizer are used after the temp dir is
   # cleaned up; presumably the vocab file is fully read at construction
   # time above -- confirm, as this is fragile otherwise.
   table = interaction_pb2.Table()
   table.rows.add().cells.add().text = cell
   token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
   actual = detokenizer.detokenize(
       table,
       token_ids,
   )
   if exepected is None:
     exepected = text
   self.assertEqual(actual, exepected)
Beispiel #5
0
def _add_tables(input_dir, interaction_dict):
    """Adds table protos to all interactions.

    Reads every table CSV referenced by any interaction from `input_dir`
    (first CSV row = header) and copies the parsed proto into each
    interaction in place.
    """
    referenced_files = {
        interaction.table.table_id
        for interactions in interaction_dict.values()
        for interaction in interactions
    }

    table_dict = {}
    for index, table_file in enumerate(sorted(referenced_files)):
        logging.log_every_n(logging.INFO, 'Read %4d / %4d table files', 100,
                            index, len(referenced_files))
        table_path = os.path.join(input_dir, table_file)
        with tf.io.gfile.GFile(table_path, 'r') as table_handle:
            parsed_rows = list(csv.reader(table_handle))

            table = interaction_pb2.Table()
            # First CSV row holds the column headers.
            for header in parsed_rows[0]:
                table.columns.add().text = header
            for row in parsed_rows[1:]:
                new_row = table.rows.add()
                for cell in row:
                    new_row.cells.add().text = cell
            table.table_id = table_file
            table_dict[table_file] = table

    for interactions in interaction_dict.values():
        for interaction in interactions:
            interaction.table.CopyFrom(table_dict[interaction.table.table_id])
Beispiel #6
0
def _convert_data(
    all_questions,
    input_file,
    tables,
):
    """Converts TabFact data to interactions format.

    Yields one Interaction per (table, question) pair listed in
    `input_file`; each interaction owns a copy of its table proto.
    """
    logging.info('Converting data from: %s...', input_file)

    counter = collections.Counter()  # Counter for stats.

    with tf.io.gfile.GFile(input_file) as file_in:
        for table_id in json.load(file_in):
            questions, labels, _ = all_questions[table_id]
            for i, (text, label) in enumerate(zip(questions, labels)):
                # The extra zeros are there to match SQA id format.
                question_id = f'{table_id}_{i}-0'
                answer = interaction_pb2.Answer(class_index=label)
                question = interaction_pb2.Question(
                    id=f'{question_id}_0',
                    original_text=text,
                    answer=answer)
                # Copy so each interaction owns an independent table proto.
                table_copy = interaction_pb2.Table()
                table_copy.CopyFrom(tables[table_id])
                yield interaction_pb2.Interaction(id=question_id,
                                                  questions=[question],
                                                  table=table_copy)

                counter['questions'] += 1
                if counter['questions'] % 1000 == 0:
                    logging.info('Processed %d questions...',
                                 counter['questions'])

        _log_stats(counter, input_file)
Beispiel #7
0
 def test_multi_cell_table(self):
     """Two answer coordinates collapse into one single-row, two-column table."""
     coordinates = [(0, 0), (1, 1)]
     interactions = list(
         hybridqa_rc_utils._create_eval_answer_interactions(
             self.input_interaction, coordinates))
     self.assertLen(interactions, 1)
     # Both selected cells end up as columns of one expanded answer table;
     # the annotated-text extension on the second cell is preserved.
     self.assertEqual(
         interactions[0].table,
         text_format.Parse(
             """
 columns {
   text: ""
 }
 columns {
   text: ""
 }
 rows {
   cells {
     text: "Jessica"
   }
   cells {
     text: "Mathematics : The abstract science of number, quantity, and space."
     [language.tapas.AnnotatedText.annotated_cell_ext] {
       annotations {
         identifier: "/wiki/Mathematics"
       }
     }
   }
 }
 table_id: "0"
 document_title: "Earth"
 document_url: "https://en.wikipedia.org/wiki/Earth"
 """, interaction_pb2.Table()))
def _create_answer_table(
    original_table,
    descriptions,
    answer_coordinates,
):
    """Converts a HybridQA Table to an expanded HybridQA RC Table.

    Builds a single-row table in which every answer coordinate of the
    original table becomes one (empty-named) column, carrying over the
    cell text and its annotation links.
    """
    answer_table = interaction_pb2.Table(
        table_id=original_table.table_id,
        document_title=original_table.document_title,
        document_url=original_table.document_url)
    row = answer_table.rows.add()
    for row_index, col_index in answer_coordinates:
        # One unnamed column per answer cell.
        answer_table.columns.add().text = ''
        source_cell = original_table.rows[row_index].cells[col_index]
        annotations = source_cell.Extensions[_annotated_cell].annotations
        links = [annotation.identifier for annotation in annotations]
        _parse_answer_cell(row.cells.add(),
                           source_cell.text,
                           links,
                           descriptions,
                           url_unquote=False)
    return answer_table
Beispiel #9
0
 def test_interaction_duplicate_column_name(self):
     """Test we don't crash when seeing ambiguous column names."""
     config = synthesize_entablement.SynthesizationConfig(attempts=10)
     # Two columns (and their cells) deliberately share the same name.
     columns = [
         interaction_pb2.Cell(text=name)
         for name in ('Name', 'Name', 'Height')
     ]
     rows = [
         interaction_pb2.Cells(
             cells=[interaction_pb2.Cell(text=text) for text in row])
         for row in (
             ('Peter', 'Peter', '100'),
             ('Bob', 'Bob', '150'),
             ('Tina', 'Tina', '200'),
         )
     ]
     interaction = interaction_pb2.Interaction(
         id='i_id',
         table=interaction_pb2.Table(
             table_id='t_id', columns=columns, rows=rows),
         questions=[])
     # Several seeds to exercise different random synthesization paths.
     for seed in range(20):
         rng = np.random.RandomState(seed)
         synthesize_entablement.synthesize_from_interaction(
             config, rng, interaction, synthesize_entablement.Counter())
Beispiel #10
0
def _merge_tables_interactions(
    key_join,
    max_num_negatives,
):
  """Joins the interactions and multiple similar table id by question id.

  Args:
    key_join: Input to merge
    max_num_negatives: Max similar tables to add. None means no limit.

  Yields:
    Merged interactions.
  """
  _, join = key_join
  if len(join["interactions"]) > 1:
    beam.metrics.Metrics.counter(
        _NS, "DulicatedQuestionIds_Interactions_" +
        str(len(join["interactions"]))).inc()
  elif not join["interactions"]:
    beam.metrics.Metrics.counter(_NS,
                                 "QuestionIds_WithoutInteraction_Jsonl").inc()
  if join["interactions"]:
    interaction = join["interactions"][0]
    tables = join["tables"]
    # Candidates are (table, score, rank) tuples; process best rank first.
    sorted_tables = sorted(tables, key=lambda t: t[2])

    table_ids = set()
    true_negatives = []
    for table, score, rank in sorted_tables:
      if max_num_negatives is not None:
        if len(true_negatives) >= max_num_negatives:
          break
      if table.table_id in table_ids:
        continue  # Skip duplicate candidates.
      table_ids.add(table.table_id)
      if table.table_id == interaction.table.table_id:
        continue  # The gold table is not a negative.
      if preprocess_nq_utils.table_contains_all_answers(
          table, interaction.questions[0]):
        continue  # A table containing all answers is not a true negative.
      true_negatives.append(_create_negative_example(table, score, rank))

    if not true_negatives:
      # Make sure we don't drop interactions.
      beam.metrics.Metrics.counter(_NS, "Interactions_WitFakeTable").inc()
      fake_table = interaction_pb2.Table()
      fake_table.table_id = "FAKE"
      fake_table.columns.add()
      fake_table.rows.add().cells.add()
      true_negatives.append(_create_negative_example(fake_table, 0.0, 0))

    # `true_negatives` is guaranteed non-empty here because of the fake-table
    # fallback above, so the former "Interaction_Without_True_negatives"
    # else-branch was unreachable dead code and has been removed.
    beam.metrics.Metrics.counter(_NS, "Interaction_With_True_negatives").inc()
    yield _create_interaction_with_negative_tables(interaction,
                                                   true_negatives)
Beispiel #11
0
def _to_table_proto(table):
    """Converts an in-memory table object into an interaction_pb2.Table."""
    proto = interaction_pb2.Table()
    for column_name in table.columns:
        proto.columns.add().text = column_name
    for source_row in table.rows:
        proto_row = proto.rows.add()
        for source_cell in source_row.cells:
            proto_row.cells.add().text = source_cell.value
    return proto
def read_from_tsv_file(file_handle):
    """Parses a TSV file in SQA format into a list of interactions.

  Args:
    file_handle:  File handle of a TSV file in SQA format.

  Returns:
    Questions grouped into interactions.
  """
    # Maps (sequence_id, table_file) -> {position: Question}.
    questions = {}
    for row in csv.DictReader(file_handle, delimiter='\t'):
        sequence_id = text_utils.get_sequence_id(row[_ID], row[_ANNOTATOR])
        key = sequence_id, row[_TABLE_FILE]
        position = int(row[_POSITION])

        answer = interaction_pb2.Answer()
        _parse_answer_coordinates(row[_ANSWER_COORDINATES], answer)
        _parse_answer_text(row[_ANSWER_TEXT], answer)

        # Optional columns: only set fields that are present and non-empty.
        if _AGGREGATION in row:
            agg_func = row[_AGGREGATION].upper().strip()
            if agg_func:
                answer.aggregation_function = _AggregationFunction.Value(
                    agg_func)
        if _ANSWER_FLOAT_VALUE in row:
            float_value = row[_ANSWER_FLOAT_VALUE]
            if float_value:
                answer.float_value = float(float_value)
        if _ANSWER_CLASS_INDEX in row:
            class_index = row[_ANSWER_CLASS_INDEX]
            if class_index:
                answer.class_index = int(class_index)

        questions.setdefault(key, {})[position] = interaction_pb2.Question(
            id=text_utils.get_question_id(sequence_id, position),
            original_text=row[_QUESTION],
            answer=answer)

    interactions = []
    for (sequence_id, table_file), question_dict in sorted(
            questions.items(), key=lambda item: item[0]):
        # Order the questions of one interaction by their position.
        question_list = [
            question
            for _, question in sorted(question_dict.items(),
                                      key=lambda item: item[0])
        ]
        interactions.append(
            interaction_pb2.Interaction(
                id=sequence_id,
                questions=question_list,
                table=interaction_pb2.Table(table_id=table_file)))
    return interactions
 def test_convert(self):
     """End-to-end conversion of one interaction to classifier features."""
     max_seq_length = 12
     with tempfile.TemporaryDirectory() as input_dir:
         vocab_file = os.path.join(input_dir, 'vocab.txt')
         _create_vocab(vocab_file, range(10))
         converter = tf_example_utils.ToClassifierTensorflowExample(
             config=tf_example_utils.ClassifierConversionConfig(
                 vocab_file=vocab_file,
                 max_seq_length=max_seq_length,
                 max_column_id=max_seq_length,
                 max_row_id=max_seq_length,
                 strip_column_names=False,
                 add_aggregation_candidates=False,
             ))
         interaction = interaction_pb2.Interaction(
             table=interaction_pb2.Table(
                 columns=[
                     interaction_pb2.Cell(text='A'),
                     interaction_pb2.Cell(text='B'),
                     interaction_pb2.Cell(text='C'),
                 ],
                 rows=[
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text='0'),
                         interaction_pb2.Cell(text='4'),
                         interaction_pb2.Cell(text='5'),
                     ]),
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text='1'),
                         interaction_pb2.Cell(text='3'),
                         interaction_pb2.Cell(text='5'),
                     ]),
                 ],
             ),
             questions=[
                 interaction_pb2.Question(id='id', original_text='2')
             ],
         )
         number_annotation_utils.add_numeric_values(interaction)
         example = converter.convert(interaction, 0)
         logging.info(example)
         # Expected serialized features: row/column ids 0 mark the question
         # prefix; the table tokens follow in row-major order.
         self.assertEqual(_get_int_feature(example, 'input_ids'),
                          [2, 8, 3, 1, 1, 1, 6, 10, 11, 7, 9, 11])
         self.assertEqual(_get_int_feature(example, 'row_ids'),
                          [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2])
         self.assertEqual(_get_int_feature(example, 'column_ids'),
                          [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3])
         self.assertEqual(_get_int_feature(example, 'column_ranks'),
                          [0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 1])
         self.assertEqual(_get_int_feature(example, 'numeric_relations'),
                          [0, 0, 0, 0, 0, 0, 4, 2, 2, 4, 2, 2])
         self.assertEqual(
             _get_float_feature(example, 'question_numeric_values'),
             _clean_nans([2.0] + [_NAN] * (_MAX_NUMERIC_VALUES - 1)))
Beispiel #14
0
def _convert_table(table_id, table_text):
    """Parses a table from # separated values format into proto format.

    The first row of `table_text` is treated as the header (column names);
    the remaining rows become table rows.

    Args:
      table_id: Table identifier, prefixed with the table directory name.
      table_text: Table content as '#'-separated values, one row per line.

    Returns:
      An interaction_pb2.Table proto.
    """
    # Initialize so an empty `table_text` yields an empty table instead of
    # raising NameError (previously `columns` was only bound at index 0).
    columns = []
    rows = []
    with six.StringIO(table_text) as csv_in:
        for index, row in enumerate(csv.reader(csv_in, delimiter='#')):
            cells = [interaction_pb2.Cell(text=text) for text in row]
            if index == 0:
                columns = cells
            else:
                rows.append(interaction_pb2.Cells(cells=cells))
    return interaction_pb2.Table(table_id=f'{_TABLE_DIR_NAME}/{table_id}',
                                 columns=columns,
                                 rows=rows)
Beispiel #15
0
 def test_convert_with_context_heading(self):
     """Conversion with document title and context heading in the prefix."""
     max_seq_length = 20
     with tempfile.TemporaryDirectory() as input_dir:
         vocab_file = os.path.join(input_dir, 'vocab.txt')
         _create_vocab(vocab_file, ['a', 'b', 'c', 'd', 'e'])
         converter = tf_example_utils.ToClassifierTensorflowExample(
             config=tf_example_utils.ClassifierConversionConfig(
                 vocab_file=vocab_file,
                 max_seq_length=max_seq_length,
                 max_column_id=max_seq_length,
                 max_row_id=max_seq_length,
                 strip_column_names=False,
                 add_aggregation_candidates=False,
                 use_document_title=True,
                 use_context_title=True,
                 update_answer_coordinates=True,
             ))
         interaction = interaction_pb2.Interaction(
             table=interaction_pb2.Table(
                 document_title='E E',
                 columns=[
                     interaction_pb2.Cell(text='A'),
                     interaction_pb2.Cell(text='A B C'),
                 ],
                 rows=[
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text='A B'),
                         interaction_pb2.Cell(text='A B C'),
                     ]),
                 ],
                 context_heading='B',
             ),
             questions=[
                 interaction_pb2.Question(
                     id='id',
                     original_text='D',
                     answer=interaction_pb2.Answer(answer_texts=['B C']),
                 )
             ],
         )
         example = converter.convert(interaction, 0)
         logging.info(example)
         self.assertEqual(
             _get_int_feature(example, 'input_ids'),
             [2, 5, 3, 10, 10, 3, 7, 3, 6, 6, 7, 8, 6, 7, 6, 7, 8, 0, 0, 0])
         # label_ids mark the tokens of the answer text 'B C' in the table.
         self.assertEqual(
             _get_int_feature(example, 'label_ids'),
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
Beispiel #16
0
def _build_table(columns, rows, **kwargs):
    """Creates a Table proto from column names, row cell texts and metadata.

    Only 'table_id' and 'document_url' are accepted as keyword metadata.
    """
    table = interaction_pb2.Table()
    for column_name in columns:
        table.columns.add().text = column_name
    for row_values in rows:
        new_row = table.rows.add()
        for cell_text in row_values:
            new_row.cells.add().text = cell_text
    allowed_keys = ('table_id', 'document_url')
    for key, value in kwargs.items():
        if key not in allowed_keys:
            raise ValueError(f'Unknown argument: {key}')
        setattr(table, key, value)
    return table
Beispiel #17
0
 def test_convert_with_trimmed_cell(self):
     """Cells longer than cell_trim_length are trimmed; overflow rows dropped."""
     max_seq_length = 16
     with tempfile.TemporaryDirectory() as input_dir:
         vocab_file = os.path.join(input_dir, 'vocab.txt')
         _create_vocab(vocab_file, range(10))
         converter = tf_example_utils.ToClassifierTensorflowExample(
             config=tf_example_utils.ClassifierConversionConfig(
                 vocab_file=vocab_file,
                 max_seq_length=max_seq_length,
                 max_column_id=max_seq_length,
                 max_row_id=max_seq_length,
                 strip_column_names=False,
                 add_aggregation_candidates=False,
                 cell_trim_length=2,
                 drop_rows_to_fit=True))
         interaction = interaction_pb2.Interaction(
             table=interaction_pb2.Table(
                 columns=[
                     interaction_pb2.Cell(text='A'),
                     interaction_pb2.Cell(text='A A'),
                     interaction_pb2.Cell(text='A A A A'),
                 ],
                 rows=[
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text='A A A'),
                         interaction_pb2.Cell(text='A A A'),
                         interaction_pb2.Cell(text='A A A'),
                     ]),
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text='A A A'),
                         interaction_pb2.Cell(text='A A A'),
                         interaction_pb2.Cell(text='A A A'),
                     ]),
                 ],
             ),
             questions=[
                 interaction_pb2.Question(id='id', original_text='A')
             ],
         )
         number_annotation_utils.add_numeric_values(interaction)
         example = converter.convert(interaction, 0)
         logging.info(example)
         # We expect the second row to be dropped all cells should be trimmed to
         # >= 2 tokens.
         self.assertEqual(_get_int_feature(example, 'column_ids'),
                          [0, 0, 0, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 0, 0])
Beispiel #18
0
 def test_simple_bm25(self):
     """Checks exact BM25 scores over a small document-title index."""
     corpus = [
         ("table_0", "aa aa cc"),
         ("table_1", "bb cc"),
         ("table_2", "dd"),
         ("table_3", "ee"),
         ("table_4", "ff"),
         ("table_5", "gg"),
         ("table_6", "hh"),
     ]
     index = tfidf_baseline_utils.create_bm25_index([
         interaction_pb2.Table(table_id=table_id, document_title=title)
         for table_id, title in corpus
     ])
     expected = [("AA", [("table_0", 1.5475852968796064)]),
                 ("BB", [("table_1", 1.2426585328757855)]),
                 ("AA CC", [("table_0", 2.0749815245480145),
                            ("table_1", 0.668184203698534)])]
     for query, results in expected:
         self.assertEqual(index.retrieve(query), results)
Beispiel #19
0
def parse_table(
    json_dict,
    descriptions,
):
    """Converts Table in JSON format to Table proto."""
    table = interaction_pb2.Table(
        table_id=str(json_dict['uid']),
        document_title=json_dict['title'],
        document_url=json_dict['url'])

    # Header and data cells are (text, links) pairs in the JSON.
    for text, links in json_dict['header']:
        _parse_cell(table.columns.add(), text, links, descriptions)

    for row_data in json_dict['data']:
        new_row = table.rows.add()
        for text, links in row_data:
            _parse_cell(new_row.cells.add(), text, links, descriptions)

    return table
Beispiel #20
0
def _get_table_proto(
    table_id,
    document_title,
    document_url,
    table_dict,
):
    """Converts a table dictionary to a Table proto."""
    table = interaction_pb2.Table(
        table_id=table_id,
        document_title=document_title,
        document_url=document_url)

    for column_name in table_dict.header:
        table.columns.add().text = column_name

    for row_values in table_dict.rows:
        new_row = table.rows.add()
        for cell_text in row_values:
            new_row.cells.add().text = cell_text
    return table
Beispiel #21
0
 def _get_interaction(self):
     """Builds a fixed 3-column, 2-row interaction with two questions.

     The first question carries gold answer coordinates (rows 2 and 0 of
     column 2); the second question has no answer annotation.
     """
     return interaction_pb2.Interaction(
         table=interaction_pb2.Table(
             columns=[
                 interaction_pb2.Cell(text="A:/, c"),
                 interaction_pb2.Cell(text="B"),
                 interaction_pb2.Cell(text="C"),
             ],
             rows=[
                 interaction_pb2.Cells(cells=[
                     interaction_pb2.Cell(text="0"),
                     interaction_pb2.Cell(text="4"),
                     interaction_pb2.Cell(text="6"),
                 ]),
                 interaction_pb2.Cells(cells=[
                     interaction_pb2.Cell(text="1"),
                     interaction_pb2.Cell(text="3"),
                     interaction_pb2.Cell(text="5"),
                 ]),
             ],
         ),
         questions=[
             interaction_pb2.Question(
                 id="id-1",
                 original_text="A is 5",
                 text="A is 5",
                 answer=interaction_pb2.Answer(answer_coordinates=[
                     interaction_pb2.AnswerCoordinate(row_index=2,
                                                      column_index=2),
                     interaction_pb2.AnswerCoordinate(row_index=0,
                                                      column_index=2),
                 ])),
             interaction_pb2.Question(id="id-2",
                                      original_text="B is A",
                                      text="A is 5 B is A")
         ],
     )
Beispiel #22
0
    def test_convert_with_token_selection(self):
        """Only tokens listed in the TableSelection extension are converted."""
        max_seq_length = 12
        with tempfile.TemporaryDirectory() as input_dir:
            vocab_file = os.path.join(input_dir, 'vocab.txt')
            _create_vocab(vocab_file, range(10))
            converter = tf_example_utils.ToClassifierTensorflowExample(
                config=tf_example_utils.ClassifierConversionConfig(
                    vocab_file=vocab_file,
                    max_seq_length=max_seq_length,
                    max_column_id=max_seq_length,
                    max_row_id=max_seq_length,
                    strip_column_names=False,
                    add_aggregation_candidates=False,
                ))
            interaction = interaction_pb2.Interaction(
                table=interaction_pb2.Table(
                    columns=[
                        interaction_pb2.Cell(text='A'),
                        interaction_pb2.Cell(text='B'),
                        interaction_pb2.Cell(text='C'),
                    ],
                    rows=[
                        interaction_pb2.Cells(cells=[
                            interaction_pb2.Cell(text='0 6'),
                            interaction_pb2.Cell(text='4 7'),
                            interaction_pb2.Cell(text='5 6'),
                        ]),
                        interaction_pb2.Cells(cells=[
                            interaction_pb2.Cell(text='1 7'),
                            interaction_pb2.Cell(text='3 6'),
                            interaction_pb2.Cell(text='5 5'),
                        ]),
                    ],
                ),
                questions=[
                    interaction_pb2.Question(id='id', original_text='2')
                ],
            )
            # Select a subset of table tokens by (row, column, token) position
            # and attach them to the question via the TableSelection extension.
            table_coordinates = []
            for r, c, t in [(0, 0, 0), (1, 0, 0), (1, 2, 0), (2, 0, 0),
                            (2, 2, 0), (2, 2, 1)]:
                table_coordinates.append(
                    table_selection_pb2.TableSelection.TokenCoordinates(
                        row_index=r, column_index=c, token_index=t))
            interaction.questions[0].Extensions[
                table_selection_pb2.TableSelection.
                table_selection_ext].CopyFrom(
                    table_selection_pb2.TableSelection(
                        selected_tokens=table_coordinates))

            number_annotation_utils.add_numeric_values(interaction)
            example = converter.convert(interaction, 0)
            logging.info(example)
            # Only the selected tokens appear; the rest of the sequence is
            # zero padding.
            self.assertEqual(_get_int_feature(example, 'input_ids'),
                             [2, 8, 3, 1, 6, 11, 7, 11, 11, 0, 0, 0])
            self.assertEqual(_get_int_feature(example, 'row_ids'),
                             [0, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0])
            self.assertEqual(_get_int_feature(example, 'column_ids'),
                             [0, 0, 0, 1, 1, 3, 1, 3, 3, 0, 0, 0])
            self.assertEqual(_get_int_feature(example, 'column_ranks'),
                             [0, 0, 0, 0, 1, 1, 2, 1, 1, 0, 0, 0])
            self.assertEqual(_get_int_feature(example, 'numeric_relations'),
                             [0, 0, 0, 0, 4, 2, 4, 2, 2, 0, 0, 0])
Beispiel #23
0
 def test_possible_interactions(self, text_proto_table, size):
     """Table dimensions computed from a text-format proto match `size`."""
     parsed_table = text_format.Parse(text_proto_table,
                                      interaction_pb2.Table())
     dimensions = hybridqa_rc_utils.get_table_dimensions(parsed_table)
     self.assertEqual(dimensions, size)
Beispiel #24
0
 def test_convert_with_negative_tables(self):
     """Retrieval conversion of one interaction plus one negative table.

     The expected features below contain two table segments (the gold table
     and the negative table), which is why 'table_id' lists two ids and the
     id/mask/rank features repeat the question+table pattern twice.
     """
     max_seq_length = 12
     with tempfile.TemporaryDirectory() as input_dir:
         vocab_file = os.path.join(input_dir, 'vocab.txt')
         _create_vocab(vocab_file, range(10))
         converter = tf_example_utils.ToRetrievalTensorflowExample(
             config=tf_example_utils.RetrievalConversionConfig(
                 vocab_file=vocab_file,
                 max_seq_length=max_seq_length,
                 max_column_id=max_seq_length,
                 max_row_id=max_seq_length,
                 strip_column_names=False,
             ))
         interaction = interaction_pb2.Interaction(
             table=interaction_pb2.Table(
                 columns=[
                     interaction_pb2.Cell(text='A'),
                     interaction_pb2.Cell(text='B'),
                     interaction_pb2.Cell(text='C'),
                 ],
                 rows=[
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text='0 6'),
                         interaction_pb2.Cell(text='4 7'),
                         interaction_pb2.Cell(text='5 6'),
                     ]),
                     interaction_pb2.Cells(cells=[
                         interaction_pb2.Cell(text='1 7'),
                         interaction_pb2.Cell(text='3 6'),
                         interaction_pb2.Cell(text='5 5'),
                     ]),
                 ],
                 table_id='table_0',
             ),
             questions=[
                 interaction_pb2.Question(
                     id='id',
                     original_text='2',
                 )
             ],
         )
         number_annotation_utils.add_numeric_values(interaction)
         # A smaller 2x2 table used as the negative retrieval example.
         n_table = interaction_pb2.Table(
             columns=[
                 interaction_pb2.Cell(text='A'),
                 interaction_pb2.Cell(text='B'),
             ],
             rows=[
                 interaction_pb2.Cells(cells=[
                     interaction_pb2.Cell(text='0 6'),
                     interaction_pb2.Cell(text='4 7'),
                 ]),
                 interaction_pb2.Cells(cells=[
                     interaction_pb2.Cell(text='1 7'),
                     interaction_pb2.Cell(text='3 6'),
                 ]),
             ],
             table_id='table_1',
         )
         number_annotation_utils.add_numeric_table_values(n_table)
         n_example = _NegativeRetrievalExample()
         n_example.table.CopyFrom(n_table)
         n_example.score = -82.0
         n_example.rank = 1
         example = converter.convert(interaction, 0, n_example)
         logging.info(example)
         self.assertEqual(_get_int_feature(example, 'input_ids'), [
             2, 5, 3, 1, 1, 1, 6, 10, 11, 7, 9, 11, 2, 5, 3, 1, 1, 6, 10, 7,
             9, 0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'row_ids'), [
             0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 2, 2,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'column_ids'), [
             0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 0, 0, 0, 1, 2, 1, 2, 1, 2,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'segment_ids'), [
             0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'input_mask'), [
             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'inv_column_ranks'), [
             0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 1, 1, 2,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'column_ranks'), [
             0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 1,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'numeric_relations'), [
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0
         ])
         self.assertEqual(_get_int_feature(example, 'table_id_hash'),
                          [911224864, 1294380046])
         self.assertEqual(_get_float_feature(example, 'numeric_values'), [
             'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 0.0, 4.0, 5.0, 1.0,
             3.0, 5.0, 'nan', 'nan', 'nan', 'nan', 'nan', 0.0, 4.0, 1.0,
             3.0, 'nan', 'nan', 'nan'
         ])
         self.assertEqual(
             _get_float_feature(example, 'numeric_values_scale'), [
                 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
             ])
         self.assertEqual([
             i.decode('utf-8')
             for i in _get_byte_feature(example, 'table_id')
         ], ['table_0', 'table_1'])
Beispiel #25
0
  def test_table_values(self, row_updates, expected_updates,
                        min_consolidation_fraction):
    """Checks numeric-value annotation of table cells.

    Args:
      row_updates: (row, column, text) triples applied to the base table
        before annotation.
      expected_updates: (column, {row: numeric_value}) pairs overriding the
        default per-column expectations after consolidation.
      min_consolidation_fraction: Forwarded to add_numeric_table_values.
    """

    expected_table = text_format.Parse(
        """
          columns {
            text: 'Name'
          }
          columns {
            text: 'Number'
          }
          columns {
            text: 'Date'
          }
          rows {
            cells {
              text: 'A'
            }
            cells {
              text: '1'
            }
            cells {
              text: 'August 2014'
            }
          }
          rows {
            cells {
              text: 'B'
            }
            cells {
              text: '2'
            }
            cells {
              text: 'July 7'
            }
          }
          rows {
            cells {
              text: 'C'
            }
            cells {
              text: '3'
            }
            cells {
              text: 'March 17, 2015'
            }
          }
    """, interaction_pb2.Table())

    # Apply the parameterized cell-text overrides before annotation.
    for row_index, col_index, text in row_updates:
      expected_table.rows[row_index].cells[col_index].text = text

    actual_table = interaction_pb2.Table()
    actual_table.CopyFrom(expected_table)
    number_annotation_utils.add_numeric_table_values(
        actual_table, min_consolidation_fraction=min_consolidation_fraction)

    # Default expectation: column 1 parses as numbers, column 2 as dates.
    expected_table.rows[0].cells[1].numeric_value.CopyFrom(_number(1))
    expected_table.rows[1].cells[1].numeric_value.CopyFrom(_number(2))
    expected_table.rows[2].cells[1].numeric_value.CopyFrom(_number(3))
    expected_table.rows[0].cells[2].numeric_value.CopyFrom(
        _date(year=2014, month=8))
    expected_table.rows[1].cells[2].numeric_value.CopyFrom(
        _date(month=7, day=7))
    expected_table.rows[2].cells[2].numeric_value.CopyFrom(
        _date(year=2015, month=3, day=17))
    # Then apply the per-test overrides: clear the column and re-set only
    # the rows listed in new_dict.
    for col_index, new_dict in expected_updates:
      for row_index in range(len(expected_table.rows)):
        expected_table.rows[row_index].cells[col_index].ClearField(
            'numeric_value')
        if row_index in new_dict:
          expected_table.rows[row_index].cells[
              col_index].numeric_value.CopyFrom(new_dict[row_index])

    self.assertEqual(expected_table, actual_table)
Beispiel #26
0
def iterate_tables(table_file):
    """Yields each `interaction_pb2.Table` stored in a TFRecord file.

    Args:
        table_file: Path to a TFRecord file of serialized Table protos.

    Yields:
        One parsed `interaction_pb2.Table` per record.
    """
    for serialized in tf.python_io.tf_record_iterator(table_file):
        # FromString constructs a fresh message and parses in one step,
        # equivalent to Table() + ParseFromString(serialized).
        yield interaction_pb2.Table.FromString(serialized)
def _init_answer_table(idx, title, url):
    """Initializes an empty Table proto with index and title."""
    table = interaction_pb2.Table()
    # The proto's table_id field is a string, so the index is stringified.
    table.table_id = str(idx)
    table.document_title = title
    table.document_url = url
    return table
# Beispiel #28
  def test_add_entity_descriptions_to_table(self):
    """Checks that entity descriptions get appended to annotated cells.

    Builds a 2x3 table, annotates the cells whose text contains entity id
    '0' or '3', then verifies that `_add_entity_descriptions_to_table`
    appends the highest-scoring description sentences (ranked against the
    question by tf-idf, per the inline comment below) to the matching cell
    text — once without and once with the entity title prefix.
    """
    annotated_cell_ext = annotated_text_pb2.AnnotatedText.annotated_cell_ext
    table = interaction_pb2.Table(
        columns=[
            interaction_pb2.Cell(text='A'),
            interaction_pb2.Cell(text='B'),
            interaction_pb2.Cell(text='C'),
        ],
        rows=[
            interaction_pb2.Cells(cells=[
                interaction_pb2.Cell(text='0 6'),
                interaction_pb2.Cell(text='4 7'),
                interaction_pb2.Cell(text='5 6'),
            ]),
            interaction_pb2.Cells(cells=[
                interaction_pb2.Cell(text='1 7'),
                interaction_pb2.Cell(text='3 6'),
                interaction_pb2.Cell(text='5 5'),
            ]),
        ],
    )
    # Add some annotations to the table: every cell whose text contains an
    # entity id gets a /wiki/<id> annotation via the proto extension.
    entities = ['0', '3']
    for row in table.rows:
      for cell in row.cells:
        for entity in entities:
          if entity in cell.text:
            cell.Extensions[annotated_cell_ext].annotations.add(
                identifier=f'/wiki/{entity}',)

    question = interaction_pb2.Question(
        id='id', text='What prime number has religious meaning?')
    # Candidate descriptions: only '/wiki/3' mentions "prime" / "religious",
    # so its sentences should rank highest against the question.
    descriptions = {
        '/wiki/0': ('0 (zero) is a number, and the digit used to represent ' +
                    'that number in numerals. It fulfills a central role in ' +
                    'mathematics as the additive identity of the integers.'),
        '/wiki/3':
            ('3 (three) is a number, numeral, and glyph. It is the natural ' +
             'number following 2 and preceding 4, and is the smallest odd ' +
             'prime number. It has religious or cultural significance.')
    }
    expected_table = interaction_pb2.Table()
    expected_table.CopyFrom(table)
    # Only the top two sentences are used, based on tf-idf score
    expected_table.rows[1].cells[1].text = (
        '3 6 ( It is the natural number following 2 and preceding 4, and is ' +
        'the smallest odd prime number. It has religious or cultural ' +
        'significance. )')

    # Case 1: descriptions appended without the entity title.
    table_without_titles = interaction_pb2.Table()
    table_without_titles.CopyFrom(table)
    tf_example_utils._add_entity_descriptions_to_table(
        question,
        descriptions,
        table_without_titles,
        num_results=2,
        use_entity_title=False)
    self.assertEqual(table_without_titles, expected_table)

    # Case 2: use_entity_title=True prefixes the description with "3 :".
    table_with_titles = interaction_pb2.Table()
    table_with_titles.CopyFrom(table)
    expected_table.rows[1].cells[1].text = (
        '3 6 ( 3 : It is the natural number following 2 and preceding 4, and ' +
        'is the smallest odd prime number. It has religious or cultural ' +
        'significance. )')
    tf_example_utils._add_entity_descriptions_to_table(
        question,
        descriptions,
        table_with_titles,
        num_results=2,
        use_entity_title=True)
    self.assertEqual(table_with_titles, expected_table)
# Beispiel #29
    def test_interaction(self, prob_count_aggregation):
        """Synthesizes entailment statements from a tiny table and checks them.

        Runs `synthesize_from_interaction` over a fixed 3-row table with 100
        different RNG seeds, then verifies that the positive and negative
        statement sets coincide, that a sample of expected statements is
        generated, and that the synthesization counters look plausible.
        """
        config = synthesize_entablement.SynthesizationConfig(
            prob_count_aggregation=prob_count_aggregation, attempts=10)
        # Fixed 3x3 input table: columns Name / Height / Age.
        interaction = interaction_pb2.Interaction(
            id='i_id',
            table=interaction_pb2.Table(
                table_id='t_id',
                columns=[
                    interaction_pb2.Cell(text='Name'),
                    interaction_pb2.Cell(text='Height'),
                    interaction_pb2.Cell(text='Age')
                ],
                rows=[
                    interaction_pb2.Cells(cells=[
                        interaction_pb2.Cell(text='Peter'),
                        interaction_pb2.Cell(text='100'),
                        interaction_pb2.Cell(text='15')
                    ]),
                    interaction_pb2.Cells(cells=[
                        interaction_pb2.Cell(text='Bob'),
                        interaction_pb2.Cell(text='150'),
                        interaction_pb2.Cell(text='15')
                    ]),
                    interaction_pb2.Cells(cells=[
                        interaction_pb2.Cell(text='Tina'),
                        interaction_pb2.Cell(text='200'),
                        interaction_pb2.Cell(text='17')
                    ]),
                ]),
            questions=[])

        pos_statements = set()
        neg_statements = set()

        counter = TestCounter()

        # Synthesize with 100 distinct seeds; class_index 1 marks a positive
        # (entailed) statement, class_index 0 a negative one.
        for i in range(100):
            rng = np.random.RandomState(i)
            interactions = synthesize_entablement.synthesize_from_interaction(
                config, rng, interaction, counter)
            for new_interaction in interactions:
                question = new_interaction.questions[0]
                if question.answer.class_index == 1:
                    pos_statements.add(question.text)
                else:
                    assert question.answer.class_index == 0
                    neg_statements.add(question.text)
        # Every positive statement should have a negative counterpart.
        self.assertEqual(neg_statements, pos_statements)
        logging.info('pos_statements: %s', pos_statements)

        counts = counter.get_counts()
        logging.info('counts: %s', counts)

        # With prob_count_aggregation == 1.0 only COUNT statements are made.
        is_count_test = prob_count_aggregation == 1.0

        if is_count_test:
            self.assertGreater(len(pos_statements), 10)
            expected_statements = {
                '1 is less than the count when age is 15 and height is greater than 100',
                '1 is less than the count when height is less than 200 and age is 15',
                '1 is the count when height is greater than 100 and age is less than 17',
                '2 is the count when age is less than 17 and height is less than 200',
            }
        else:
            self.assertGreater(len(pos_statements), 100)
            expected_statements = {
                '0 is the range of age when height is greater than 100',
                '100 is less than the last height when height is less than 200',
                '125 is greater than height when name is peter',
                '15 is age when height is less than 150',
                '15 is the last age when height is less than 200',
                '150 is the last height when age is 15',
                '175 is the average height when age is less than 17',
                '200 is greater than the greatest height when age is less than 17',
                '250 is less than the total height when age is 15',
                '30 is less than the total age when height is greater than 100',
                'bob is name when age is greater than 15',
                'bob is the first name when age is 15 and height is less than 200',
                'peter is name when age is 15 and height is less than 150',
                'the average height when age is 15 is less than 175',
                'the first height when height is greater than 100 is 150',
                'the first height when height is less than 200 is 150',
                'the first name when age is 15 is name when name is peter',
                'the greatest height when age is 15 is less than 200',
                'the last age when height is greater than 100 is greater than 15',
                'the last name when age is 15 is bob',
                'the last name when age is less than 17 is peter',
                'the last name when height is greater than 100 is bob',
                'the last name when height is less than 200 is bob',
                'the lowest height when age is 15 is 150',
                'the range of age when height is greater than 100 is greater than 0',
                'the range of height when age is 15 is 100',
                'tina is name when age is greater than 15 and height is 200',
                'tina is the first name when age is 15',
                'tina is the last name when age is 15',
                'tina is the last name when height is greater than 100',
            }

        # Spot-check: every expected statement must have been synthesized.
        for statement in expected_statements:
            self.assertIn(statement, pos_statements)

        # Counter sanity checks, symmetric for positive and negative outputs.
        for name in ['pos', 'neg']:
            if is_count_test:
                self.assertGreater(counts[f'{name}: Synthesization success'],
                                   10)
                self.assertGreater(counts[f'{name}: Select: COUNT'], 10)
            else:
                self.assertEqual(counts[f'{name}: Synthesization success'],
                                 100)
                for aggregation in Aggregation:
                    self.assertGreater(
                        counts[f'{name}: Select: {aggregation.name}'], 0)
            for comparator in Comparator:
                min_count = 1 if prob_count_aggregation == 1.0 else 10
                self.assertGreater(
                    counts[f'{name}: Comparator {comparator.name}'], min_count)
                self.assertGreater(
                    counts[f'{name}: where: Comparator {comparator.name}'],
                    min_count)