def test_end_to_end(self, runner_type, add_example_conversion):
  """Runs the synthesization pipeline end to end and checks its outputs.

  Builds the pipeline with fixed settings, executes it with the given beam
  runner, and asserts that the train/test shards (and, when example
  conversion is enabled, the interactions shard) are non-empty.

  Args:
    runner_type: Beam runner type forwarded to `beam_runner.run_type`.
    add_example_conversion: If True, also convert interactions to
      tf.train.Example records using a freshly created vocab.
  """
  mode = intermediate_pretrain_utils.Mode.ALL
  prob_count_aggregation = 0.2
  use_fake_table = False
  add_opposite_table = False
  drop_without_support_rate = 0.0
  with tempfile.TemporaryDirectory() as temp_dir:
    config = None
    if add_example_conversion:
      vocab_path = os.path.join(temp_dir, "vocab.txt")
      _create_vocab(list(_RESERVED_SYMBOLS) + ["released"], vocab_path)
      config = tf_example_utils.ClassifierConversionConfig(
          vocab_file=vocab_path,
          max_seq_length=32,
          max_column_id=32,
          max_row_id=32,
          strip_column_names=False,
      )
    pipeline = intermediate_pretrain_utils.build_pipeline(
        mode=mode,
        config=synthesize_entablement.SynthesizationConfig(
            prob_count_aggregation=prob_count_aggregation),
        use_fake_table=use_fake_table,
        add_opposite_table=add_opposite_table,
        drop_without_support_rate=drop_without_support_rate,
        input_file=os.path.join(self._test_dir,
                                "pretrain_interactions.txtpb"),
        output_dir=temp_dir,
        output_suffix=".tfrecord",
        num_splits=3,
        conversion_config=config,
    )
    beam_runner.run_type(pipeline, runner_type).wait_until_finish()
    message_type = interaction_pb2.Interaction
    if add_example_conversion:
      message_type = tf.train.Example
    # Fix: the original iterated over [("train"), ("test")] — the parentheses
    # are redundant (they do not create tuples) and misleading; use plain
    # strings. Behavior is identical.
    for name in ["train", "test"]:
      self.assertNotEmpty(
          list(
              _read_record(
                  os.path.join(temp_dir, f"{name}.tfrecord"),
                  message_type,
              )))
    if add_example_conversion:
      self.assertNotEmpty(
          list(
              _read_record(
                  os.path.join(temp_dir, "interactions.tfrecord"),
                  interaction_pb2.Interaction,
              ),
          ))
def test_convert(self):
  """Converts a small numeric table and verifies the emitted features."""
  max_seq_length = 12
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, range(10))
    conversion_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=vocab_file,
        max_seq_length=max_seq_length,
        max_column_id=max_seq_length,
        max_row_id=max_seq_length,
        strip_column_names=False,
        add_aggregation_candidates=False,
    )
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=conversion_config)
    # Build a 3-column, 2-row numeric table declaratively.
    table = interaction_pb2.Table(
        columns=[interaction_pb2.Cell(text=h) for h in ('A', 'B', 'C')],
        rows=[
            interaction_pb2.Cells(
                cells=[interaction_pb2.Cell(text=v) for v in row])
            for row in (('0', '4', '5'), ('1', '3', '5'))
        ],
    )
    interaction = interaction_pb2.Interaction(
        table=table,
        questions=[interaction_pb2.Question(id='id', original_text='2')],
    )
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    self.assertEqual(
        _get_int_feature(example, 'input_ids'),
        [2, 8, 3, 1, 1, 1, 6, 10, 11, 7, 9, 11])
    self.assertEqual(
        _get_int_feature(example, 'row_ids'),
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2])
    self.assertEqual(
        _get_int_feature(example, 'column_ids'),
        [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3])
    self.assertEqual(
        _get_int_feature(example, 'column_ranks'),
        [0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 1])
    self.assertEqual(
        _get_int_feature(example, 'numeric_relations'),
        [0, 0, 0, 0, 0, 0, 4, 2, 2, 4, 2, 2])
    self.assertEqual(
        _get_float_feature(example, 'question_numeric_values'),
        _clean_nans([2.0] + [_NAN] * (_MAX_NUMERIC_VALUES - 1)))
def test_convert_with_context_heading(self):
  """Checks conversion when document title and context heading are used."""
  max_seq_length = 20
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, ['a', 'b', 'c', 'd', 'e'])
    conversion_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=vocab_file,
        max_seq_length=max_seq_length,
        max_column_id=max_seq_length,
        max_row_id=max_seq_length,
        strip_column_names=False,
        add_aggregation_candidates=False,
        use_document_title=True,
        use_context_title=True,
        update_answer_coordinates=True,
    )
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=conversion_config)
    # Table with a document title and a context heading; the answer text
    # 'B C' appears in the second cell of the single row.
    table = interaction_pb2.Table(
        document_title='E E',
        columns=[
            interaction_pb2.Cell(text=h) for h in ('A', 'A B C')
        ],
        rows=[
            interaction_pb2.Cells(cells=[
                interaction_pb2.Cell(text=v) for v in ('A B', 'A B C')
            ]),
        ],
        context_heading='B',
    )
    question = interaction_pb2.Question(
        id='id',
        original_text='D',
        answer=interaction_pb2.Answer(answer_texts=['B C']),
    )
    interaction = interaction_pb2.Interaction(
        table=table, questions=[question])
    example = converter.convert(interaction, 0)
    logging.info(example)
    self.assertEqual(
        _get_int_feature(example, 'input_ids'),
        [2, 5, 3, 10, 10, 3, 7, 3, 6, 6, 7, 8, 6, 7, 6, 7, 8, 0, 0, 0])
    self.assertEqual(
        _get_int_feature(example, 'label_ids'),
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
def test_convert_with_trimmed_cell(self):
  """Checks that long cells are trimmed and overflowing rows are dropped."""
  max_seq_length = 16
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, range(10))
    conversion_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=vocab_file,
        max_seq_length=max_seq_length,
        max_column_id=max_seq_length,
        max_row_id=max_seq_length,
        strip_column_names=False,
        add_aggregation_candidates=False,
        cell_trim_length=2,
        drop_rows_to_fit=True)
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=conversion_config)
    # Every body cell is 'A A A' (3 tokens) so each must be trimmed to the
    # configured cell_trim_length of 2.
    body_row = lambda: interaction_pb2.Cells(cells=[
        interaction_pb2.Cell(text='A A A') for _ in range(3)
    ])
    table = interaction_pb2.Table(
        columns=[
            interaction_pb2.Cell(text=h) for h in ('A', 'A A', 'A A A A')
        ],
        rows=[body_row(), body_row()],
    )
    interaction = interaction_pb2.Interaction(
        table=table,
        questions=[interaction_pb2.Question(id='id', original_text='A')],
    )
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    # We expect the second row to be dropped, and all cells should be
    # trimmed to at most 2 tokens.
    self.assertEqual(
        _get_int_feature(example, 'column_ids'),
        [0, 0, 0, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 0, 0])
def test_get_empty_example(self):
  """Verifies that an empty (padding) example carries the padded question id."""
  max_seq_length = 3
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    # An empty vocab is enough: the padding example contains no real tokens.
    _create_vocab(vocab_file, [])
    conversion_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=vocab_file,
        max_seq_length=max_seq_length,
        max_column_id=max_seq_length,
        max_row_id=max_seq_length,
        strip_column_names=False,
        add_aggregation_candidates=False,
    )
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=conversion_config)
    example = converter.get_empty_example()
    logging.info(example)
    raw_id = _get_byte_feature(example, 'question_id')[0]
    self.assertEqual(
        raw_id.decode('utf-8'), text_utils.get_padded_question_id())
def __init__(self,
             model_dir: Text,
             task: Text,
             tf_record_filename: Text = "test.tfrecord"):
  """Initializes the predictor from a model directory.

  Loads the vocab and BERT config from `model_dir`, builds the example
  converter, and instantiates the TAPAS model for the given task.

  Args:
    model_dir: Directory containing `vocab.txt` and `bert_config.json`.
    task: Task name forwarded to `get_config`.
    tf_record_filename: Name of the TFRecord file used for inference input.
  """
  self.tf_record_filename = tf_record_filename
  self.task = task
  vocab_file = os.path.join(model_dir, "vocab.txt")
  bert_config_file = os.path.join(model_dir, "bert_config.json")
  classifier_conversion_config = tf_example_utils.ClassifierConversionConfig(
      vocab_file=vocab_file,
      max_seq_length=_MAX_SEQ_LENGTH,
      max_column_id=_MAX_TABLE_ID,
      max_row_id=_MAX_TABLE_ID,
      strip_column_names=False,
      add_aggregation_candidates=False)
  self.converter = tf_example_utils.ToClassifierTensorflowExample(
      classifier_conversion_config)
  # Fix: the original used json.load(open(...)), which leaks the file
  # handle; close it deterministically with a context manager.
  with open(bert_config_file) as f:
    self.bert_config = json.load(f)
  self.tapas_config = get_config(task, self.bert_config, model_dir)
  self.tapas = get_model(self.tapas_config, _MAX_SEQ_LENGTH)
def main(unused_argv):
  """Builds the intermediate pre-training pipeline from flags and runs it."""
  del unused_argv
  synth_config = synthesize_entablement.SynthesizationConfig(
      prob_count_aggregation=FLAGS.prob_count_aggregation,
  )
  # Only build a conversion config when example conversion is requested.
  if FLAGS.convert_to_examples:
    example_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=FLAGS.vocab_file,
        max_seq_length=FLAGS.max_seq_length,
        max_column_id=FLAGS.max_seq_length,
        max_row_id=FLAGS.max_seq_length,
        strip_column_names=False,
    )
  else:
    example_config = None
  pipeline = intermediate_pretrain_utils.build_pipeline(
      mode=FLAGS.mode,
      config=synth_config,
      use_fake_table=FLAGS.use_fake_table,
      add_opposite_table=FLAGS.add_opposite_table,
      drop_without_support_rate=FLAGS.drop_without_support_rate,
      input_file=FLAGS.input_file,
      output_dir=FLAGS.output_dir,
      output_suffix=FLAGS.output_suffix,
      conversion_config=example_config)
  beam_runner.run(pipeline).wait_until_finish()
def _create_examples(
    interaction_dir,
    example_dir,
    vocab_file,
    filename,
    batch_size,
    test_mode,
):
  """Creates TF examples for a single dataset.

  Reads interactions from `interaction_dir/filename.tfrecord`, converts each
  question to a tf.train.Example, and writes them to the matching file under
  `example_dir`. When `batch_size` is set, pads the example list with empty
  examples so its length is divisible by the batch size (TPU eval drops
  remainder batches); otherwise shuffles the examples.

  Args:
    interaction_dir: Directory containing the input interaction TFRecord.
    example_dir: Directory to write the output example TFRecord to.
    vocab_file: Path to the BERT vocabulary file.
    filename: Dataset name (without the `.tfrecord` suffix).
    batch_size: Eval batch size to pad to, or None to shuffle instead.
    test_mode: If True, stop after ~100 examples for fast testing.
  """
  # Bug fix: the original reassigned `filename` to a constant f-string with
  # no placeholder, discarding the `filename` argument entirely; interpolate
  # the dataset name as clearly intended.
  filename = f'{filename}.tfrecord'
  interaction_path = os.path.join(interaction_dir, filename)
  example_path = os.path.join(example_dir, filename)
  config = tf_example_utils.ClassifierConversionConfig(
      vocab_file=vocab_file,
      max_seq_length=FLAGS.max_seq_length,
      max_column_id=_MAX_TABLE_ID,
      max_row_id=_MAX_TABLE_ID,
      strip_column_names=False,
      add_aggregation_candidates=False,
  )
  converter = tf_example_utils.ToClassifierTensorflowExample(config)
  examples = []
  num_questions = 0
  num_conversion_errors = 0
  for interaction in prediction_utils.iterate_interactions(interaction_path):
    number_annotation_utils.add_numeric_values(interaction)
    for i in range(len(interaction.questions)):
      num_questions += 1
      try:
        examples.append(converter.convert(interaction, i))
      except ValueError as e:
        num_conversion_errors += 1
        logging.info("Can't convert interaction: %s error: %s",
                     interaction.id, e)
    if test_mode and len(examples) >= 100:
      break
  # Bug fix: the original logged the literal f'Processed: (unknown)' (an
  # f-string with no placeholder); report the actual input path.
  _print(f'Processed: {interaction_path}')
  _print(f'Num questions processed: {num_questions}')
  _print(f'Num examples: {len(examples)}')
  _print(f'Num conversion errors: {num_conversion_errors}')
  if batch_size is None:
    random.shuffle(examples)
  else:
    # Make sure the eval sets are divisible by the test batch size since
    # otherwise examples will be dropped on TPU.
    # These examples will later be ignored when writing the predictions.
    original_num_examples = len(examples)  # typo fix: was "originial"
    while len(examples) % batch_size != 0:
      examples.append(converter.get_empty_example())
    if original_num_examples != len(examples):
      _print(f'Padded with {len(examples) - original_num_examples} examples.')
  with tf.io.TFRecordWriter(
      example_path,
      options=_to_tf_compression_type(FLAGS.compression_type),
  ) as writer:
    for example in examples:
      writer.write(example.SerializeToString())
def test_convert_with_token_selection(self):
  """Checks that only explicitly selected table tokens are converted."""
  max_seq_length = 12
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, range(10))
    conversion_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=vocab_file,
        max_seq_length=max_seq_length,
        max_column_id=max_seq_length,
        max_row_id=max_seq_length,
        strip_column_names=False,
        add_aggregation_candidates=False,
    )
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=conversion_config)
    table = interaction_pb2.Table(
        columns=[interaction_pb2.Cell(text=h) for h in ('A', 'B', 'C')],
        rows=[
            interaction_pb2.Cells(
                cells=[interaction_pb2.Cell(text=v) for v in row])
            for row in (('0 6', '4 7', '5 6'), ('1 7', '3 6', '5 5'))
        ],
    )
    interaction = interaction_pb2.Interaction(
        table=table,
        questions=[interaction_pb2.Question(id='id', original_text='2')],
    )
    # Select a subset of table tokens; only these should appear in the
    # converted example (row_index 0 is the header row).
    table_coordinates = [
        table_selection_pb2.TableSelection.TokenCoordinates(
            row_index=r, column_index=c, token_index=t)
        for r, c, t in ((0, 0, 0), (1, 0, 0), (1, 2, 0), (2, 0, 0),
                        (2, 2, 0), (2, 2, 1))
    ]
    selection_ext = interaction.questions[0].Extensions[
        table_selection_pb2.TableSelection.table_selection_ext]
    selection_ext.CopyFrom(
        table_selection_pb2.TableSelection(
            selected_tokens=table_coordinates))
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    self.assertEqual(
        _get_int_feature(example, 'input_ids'),
        [2, 8, 3, 1, 6, 11, 7, 11, 11, 0, 0, 0])
    self.assertEqual(
        _get_int_feature(example, 'row_ids'),
        [0, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0])
    self.assertEqual(
        _get_int_feature(example, 'column_ids'),
        [0, 0, 0, 1, 1, 3, 1, 3, 3, 0, 0, 0])
    self.assertEqual(
        _get_int_feature(example, 'column_ranks'),
        [0, 0, 0, 0, 1, 1, 2, 1, 1, 0, 0, 0])
    self.assertEqual(
        _get_int_feature(example, 'numeric_relations'),
        [0, 0, 0, 0, 4, 2, 4, 2, 2, 0, 0, 0])
from tapas.scripts import prediction_utils

# Prepare the SQA results layout: directories for TF examples and the model.
os.makedirs('results/sqa/tf_examples', exist_ok=True)
os.makedirs('results/sqa/model', exist_ok=True)
# Write a checkpoint index file pointing at the copied checkpoint below.
with open('results/sqa/model/checkpoint', 'w') as f:
  f.write('model_checkpoint_path: "model.ckpt-0"')
# Copy the pretrained SQA checkpoint shards into the results model dir.
for suffix in ['.data-00000-of-00001', '.index', '.meta']:
  shutil.copyfile(f'tapas_sqa_base/model.ckpt{suffix}',
                  f'results/sqa/model/model.ckpt-0{suffix}')

max_seq_length = 512
vocab_file = "tapas_sqa_base/vocab.txt"
# Conversion config: column/row id limits are tied to the sequence length.
config = tf_example_utils.ClassifierConversionConfig(
    vocab_file=vocab_file,
    max_seq_length=max_seq_length,
    max_column_id=max_seq_length,
    max_row_id=max_seq_length,
    strip_column_names=False,
    add_aggregation_candidates=False,
)
converter = tf_example_utils.ToClassifierTensorflowExample(config)


def convert_interactions_to_examples(tables_and_queries):
  """Calls Tapas converter to convert interaction to example."""
  # Each input is a (table, queries) pair; one Interaction is built per pair,
  # with one Question per query. Question ids follow the "{idx}-0_{position}"
  # convention expected downstream.
  for idx, (table, queries) in enumerate(tables_and_queries):
    interaction = interaction_pb2.Interaction()
    for position, query in enumerate(queries):
      question = interaction.questions.add()
      question.original_text = query
      question.id = f"{idx}-0_{position}"
    # NOTE(review): the definition continues beyond this chunk; table[0] is
    # presumably the header row — confirm against the full file.
    for header in table[0]: