def test_end_to_end(self, runner_type, add_example_conversion):
  """Runs the synthesization pipeline end to end and checks its outputs.

  Builds the pipeline with fixed settings, executes it with the given beam
  runner, and asserts that the train/test shards (and, when example
  conversion is enabled, the interactions shard) are non-empty.

  Args:
    runner_type: Beam runner type forwarded to `beam_runner.run_type`.
    add_example_conversion: If True, also convert interactions to
      tf.train.Example records using a freshly created vocab.
  """
  mode = intermediate_pretrain_utils.Mode.ALL
  prob_count_aggregation = 0.2
  use_fake_table = False
  add_opposite_table = False
  drop_without_support_rate = 0.0
  with tempfile.TemporaryDirectory() as temp_dir:
    config = None
    if add_example_conversion:
      vocab_path = os.path.join(temp_dir, "vocab.txt")
      _create_vocab(list(_RESERVED_SYMBOLS) + ["released"], vocab_path)
      config = tf_example_utils.ClassifierConversionConfig(
          vocab_file=vocab_path,
          max_seq_length=32,
          max_column_id=32,
          max_row_id=32,
          strip_column_names=False,
      )
    pipeline = intermediate_pretrain_utils.build_pipeline(
        mode=mode,
        config=synthesize_entablement.SynthesizationConfig(
            prob_count_aggregation=prob_count_aggregation),
        use_fake_table=use_fake_table,
        add_opposite_table=add_opposite_table,
        drop_without_support_rate=drop_without_support_rate,
        input_file=os.path.join(self._test_dir,
                                "pretrain_interactions.txtpb"),
        output_dir=temp_dir,
        output_suffix=".tfrecord",
        num_splits=3,
        conversion_config=config,
    )
    beam_runner.run_type(pipeline, runner_type).wait_until_finish()
    message_type = interaction_pb2.Interaction
    if add_example_conversion:
      message_type = tf.train.Example
    # Fix: the original iterated over [("train"), ("test")] — the parentheses
    # are redundant (they do not create tuples) and misleading; use plain
    # strings. Behavior is identical.
    for name in ["train", "test"]:
      self.assertNotEmpty(
          list(
              _read_record(
                  os.path.join(temp_dir, f"{name}.tfrecord"),
                  message_type,
              )))
    if add_example_conversion:
      self.assertNotEmpty(
          list(
              _read_record(
                  os.path.join(temp_dir, "interactions.tfrecord"),
                  interaction_pb2.Interaction,
              ),
          ))
def test_convert(self):
  """Converts a small numeric table and verifies the emitted features."""
  max_seq_length = 12
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, range(10))
    conversion_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=vocab_file,
        max_seq_length=max_seq_length,
        max_column_id=max_seq_length,
        max_row_id=max_seq_length,
        strip_column_names=False,
        add_aggregation_candidates=False,
    )
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=conversion_config)
    # Build a 3-column, 2-row numeric table declaratively.
    table = interaction_pb2.Table(
        columns=[interaction_pb2.Cell(text=h) for h in ('A', 'B', 'C')],
        rows=[
            interaction_pb2.Cells(
                cells=[interaction_pb2.Cell(text=v) for v in row])
            for row in (('0', '4', '5'), ('1', '3', '5'))
        ],
    )
    interaction = interaction_pb2.Interaction(
        table=table,
        questions=[interaction_pb2.Question(id='id', original_text='2')],
    )
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    self.assertEqual(
        _get_int_feature(example, 'input_ids'),
        [2, 8, 3, 1, 1, 1, 6, 10, 11, 7, 9, 11])
    self.assertEqual(
        _get_int_feature(example, 'row_ids'),
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2])
    self.assertEqual(
        _get_int_feature(example, 'column_ids'),
        [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3])
    self.assertEqual(
        _get_int_feature(example, 'column_ranks'),
        [0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 1])
    self.assertEqual(
        _get_int_feature(example, 'numeric_relations'),
        [0, 0, 0, 0, 0, 0, 4, 2, 2, 4, 2, 2])
    self.assertEqual(
        _get_float_feature(example, 'question_numeric_values'),
        _clean_nans([2.0] + [_NAN] * (_MAX_NUMERIC_VALUES - 1)))
def test_convert_with_context_heading(self):
  """Checks conversion when document title and context heading are used."""
  max_seq_length = 20
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, ['a', 'b', 'c', 'd', 'e'])
    conversion_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=vocab_file,
        max_seq_length=max_seq_length,
        max_column_id=max_seq_length,
        max_row_id=max_seq_length,
        strip_column_names=False,
        add_aggregation_candidates=False,
        use_document_title=True,
        use_context_title=True,
        update_answer_coordinates=True,
    )
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=conversion_config)
    # Table with a document title and a context heading; the answer text
    # 'B C' appears in the second cell of the single row.
    table = interaction_pb2.Table(
        document_title='E E',
        columns=[
            interaction_pb2.Cell(text=h) for h in ('A', 'A B C')
        ],
        rows=[
            interaction_pb2.Cells(cells=[
                interaction_pb2.Cell(text=v) for v in ('A B', 'A B C')
            ]),
        ],
        context_heading='B',
    )
    question = interaction_pb2.Question(
        id='id',
        original_text='D',
        answer=interaction_pb2.Answer(answer_texts=['B C']),
    )
    interaction = interaction_pb2.Interaction(
        table=table, questions=[question])
    example = converter.convert(interaction, 0)
    logging.info(example)
    self.assertEqual(
        _get_int_feature(example, 'input_ids'),
        [2, 5, 3, 10, 10, 3, 7, 3, 6, 6, 7, 8, 6, 7, 6, 7, 8, 0, 0, 0])
    self.assertEqual(
        _get_int_feature(example, 'label_ids'),
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
def test_convert_with_trimmed_cell(self):
  """Checks that long cells are trimmed and overflowing rows are dropped."""
  max_seq_length = 16
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, range(10))
    conversion_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=vocab_file,
        max_seq_length=max_seq_length,
        max_column_id=max_seq_length,
        max_row_id=max_seq_length,
        strip_column_names=False,
        add_aggregation_candidates=False,
        cell_trim_length=2,
        drop_rows_to_fit=True)
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=conversion_config)
    # Every body cell is 'A A A' (3 tokens) so each must be trimmed to the
    # configured cell_trim_length of 2.
    body_row = lambda: interaction_pb2.Cells(cells=[
        interaction_pb2.Cell(text='A A A') for _ in range(3)
    ])
    table = interaction_pb2.Table(
        columns=[
            interaction_pb2.Cell(text=h) for h in ('A', 'A A', 'A A A A')
        ],
        rows=[body_row(), body_row()],
    )
    interaction = interaction_pb2.Interaction(
        table=table,
        questions=[interaction_pb2.Question(id='id', original_text='A')],
    )
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    # We expect the second row to be dropped, and all cells should be
    # trimmed to at most 2 tokens.
    self.assertEqual(
        _get_int_feature(example, 'column_ids'),
        [0, 0, 0, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 0, 0])
def test_get_empty_example(self):
  """Verifies that an empty (padding) example carries the padded question id."""
  max_seq_length = 3
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    # An empty vocab is enough: the padding example contains no real tokens.
    _create_vocab(vocab_file, [])
    conversion_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=vocab_file,
        max_seq_length=max_seq_length,
        max_column_id=max_seq_length,
        max_row_id=max_seq_length,
        strip_column_names=False,
        add_aggregation_candidates=False,
    )
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=conversion_config)
    example = converter.get_empty_example()
    logging.info(example)
    raw_id = _get_byte_feature(example, 'question_id')[0]
    self.assertEqual(
        raw_id.decode('utf-8'), text_utils.get_padded_question_id())
def __init__(self,
             model_dir: Text,
             task: Text,
             tf_record_filename: Text = "test.tfrecord"):
  """Initializes the predictor from a model directory.

  Loads the vocab and BERT config from `model_dir`, builds the example
  converter, and instantiates the TAPAS model for the given task.

  Args:
    model_dir: Directory containing `vocab.txt` and `bert_config.json`.
    task: Task name forwarded to `get_config`.
    tf_record_filename: Name of the TFRecord file used for inference input.
  """
  self.tf_record_filename = tf_record_filename
  self.task = task
  vocab_file = os.path.join(model_dir, "vocab.txt")
  bert_config_file = os.path.join(model_dir, "bert_config.json")
  classifier_conversion_config = tf_example_utils.ClassifierConversionConfig(
      vocab_file=vocab_file,
      max_seq_length=_MAX_SEQ_LENGTH,
      max_column_id=_MAX_TABLE_ID,
      max_row_id=_MAX_TABLE_ID,
      strip_column_names=False,
      add_aggregation_candidates=False)
  self.converter = tf_example_utils.ToClassifierTensorflowExample(
      classifier_conversion_config)
  # Fix: the original used json.load(open(...)), which leaks the file
  # handle; close it deterministically with a context manager.
  with open(bert_config_file) as f:
    self.bert_config = json.load(f)
  self.tapas_config = get_config(task, self.bert_config, model_dir)
  self.tapas = get_model(self.tapas_config, _MAX_SEQ_LENGTH)
def main(unused_argv):
  """Builds the intermediate pre-training pipeline from flags and runs it."""
  del unused_argv
  synth_config = synthesize_entablement.SynthesizationConfig(
      prob_count_aggregation=FLAGS.prob_count_aggregation,
  )
  # Only build a conversion config when example conversion is requested.
  if FLAGS.convert_to_examples:
    example_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=FLAGS.vocab_file,
        max_seq_length=FLAGS.max_seq_length,
        max_column_id=FLAGS.max_seq_length,
        max_row_id=FLAGS.max_seq_length,
        strip_column_names=False,
    )
  else:
    example_config = None
  pipeline = intermediate_pretrain_utils.build_pipeline(
      mode=FLAGS.mode,
      config=synth_config,
      use_fake_table=FLAGS.use_fake_table,
      add_opposite_table=FLAGS.add_opposite_table,
      drop_without_support_rate=FLAGS.drop_without_support_rate,
      input_file=FLAGS.input_file,
      output_dir=FLAGS.output_dir,
      output_suffix=FLAGS.output_suffix,
      conversion_config=example_config)
  beam_runner.run(pipeline).wait_until_finish()
def _create_examples(
    interaction_dir,
    example_dir,
    vocab_file,
    filename,
    batch_size,
    test_mode,
):
  """Creates TF examples for a single dataset.

  Reads interactions from `interaction_dir/filename.tfrecord`, converts each
  question to a tf.train.Example, and writes them to the matching file under
  `example_dir`. When `batch_size` is set, pads the example list with empty
  examples so its length is divisible by the batch size (TPU eval drops
  remainder batches); otherwise shuffles the examples.

  Args:
    interaction_dir: Directory containing the input interaction TFRecord.
    example_dir: Directory to write the output example TFRecord to.
    vocab_file: Path to the BERT vocabulary file.
    filename: Dataset name (without the `.tfrecord` suffix).
    batch_size: Eval batch size to pad to, or None to shuffle instead.
    test_mode: If True, stop after ~100 examples for fast testing.
  """
  # Bug fix: the original reassigned `filename` to a constant f-string with
  # no placeholder, discarding the `filename` argument entirely; interpolate
  # the dataset name as clearly intended.
  filename = f'{filename}.tfrecord'
  interaction_path = os.path.join(interaction_dir, filename)
  example_path = os.path.join(example_dir, filename)
  config = tf_example_utils.ClassifierConversionConfig(
      vocab_file=vocab_file,
      max_seq_length=FLAGS.max_seq_length,
      max_column_id=_MAX_TABLE_ID,
      max_row_id=_MAX_TABLE_ID,
      strip_column_names=False,
      add_aggregation_candidates=False,
  )
  converter = tf_example_utils.ToClassifierTensorflowExample(config)
  examples = []
  num_questions = 0
  num_conversion_errors = 0
  for interaction in prediction_utils.iterate_interactions(interaction_path):
    number_annotation_utils.add_numeric_values(interaction)
    for i in range(len(interaction.questions)):
      num_questions += 1
      try:
        examples.append(converter.convert(interaction, i))
      except ValueError as e:
        num_conversion_errors += 1
        logging.info("Can't convert interaction: %s error: %s",
                     interaction.id, e)
    if test_mode and len(examples) >= 100:
      break
  # Bug fix: the original logged the literal f'Processed: (unknown)' (an
  # f-string with no placeholder); report the actual input path.
  _print(f'Processed: {interaction_path}')
  _print(f'Num questions processed: {num_questions}')
  _print(f'Num examples: {len(examples)}')
  _print(f'Num conversion errors: {num_conversion_errors}')
  if batch_size is None:
    random.shuffle(examples)
  else:
    # Make sure the eval sets are divisible by the test batch size since
    # otherwise examples will be dropped on TPU.
    # These examples will later be ignored when writing the predictions.
    original_num_examples = len(examples)  # typo fix: was "originial"
    while len(examples) % batch_size != 0:
      examples.append(converter.get_empty_example())
    if original_num_examples != len(examples):
      _print(f'Padded with {len(examples) - original_num_examples} examples.')
  with tf.io.TFRecordWriter(
      example_path,
      options=_to_tf_compression_type(FLAGS.compression_type),
  ) as writer:
    for example in examples:
      writer.write(example.SerializeToString())
def test_convert_with_token_selection(self):
  """Checks that only explicitly selected table tokens are converted."""
  max_seq_length = 12
  with tempfile.TemporaryDirectory() as input_dir:
    vocab_file = os.path.join(input_dir, 'vocab.txt')
    _create_vocab(vocab_file, range(10))
    conversion_config = tf_example_utils.ClassifierConversionConfig(
        vocab_file=vocab_file,
        max_seq_length=max_seq_length,
        max_column_id=max_seq_length,
        max_row_id=max_seq_length,
        strip_column_names=False,
        add_aggregation_candidates=False,
    )
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=conversion_config)
    table = interaction_pb2.Table(
        columns=[interaction_pb2.Cell(text=h) for h in ('A', 'B', 'C')],
        rows=[
            interaction_pb2.Cells(
                cells=[interaction_pb2.Cell(text=v) for v in row])
            for row in (('0 6', '4 7', '5 6'), ('1 7', '3 6', '5 5'))
        ],
    )
    interaction = interaction_pb2.Interaction(
        table=table,
        questions=[interaction_pb2.Question(id='id', original_text='2')],
    )
    # Select a subset of table tokens; only these should appear in the
    # converted example (row_index 0 is the header row).
    table_coordinates = [
        table_selection_pb2.TableSelection.TokenCoordinates(
            row_index=r, column_index=c, token_index=t)
        for r, c, t in ((0, 0, 0), (1, 0, 0), (1, 2, 0), (2, 0, 0),
                        (2, 2, 0), (2, 2, 1))
    ]
    selection_ext = interaction.questions[0].Extensions[
        table_selection_pb2.TableSelection.table_selection_ext]
    selection_ext.CopyFrom(
        table_selection_pb2.TableSelection(
            selected_tokens=table_coordinates))
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    self.assertEqual(
        _get_int_feature(example, 'input_ids'),
        [2, 8, 3, 1, 6, 11, 7, 11, 11, 0, 0, 0])
    self.assertEqual(
        _get_int_feature(example, 'row_ids'),
        [0, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0])
    self.assertEqual(
        _get_int_feature(example, 'column_ids'),
        [0, 0, 0, 1, 1, 3, 1, 3, 3, 0, 0, 0])
    self.assertEqual(
        _get_int_feature(example, 'column_ranks'),
        [0, 0, 0, 0, 1, 1, 2, 1, 1, 0, 0, 0])
    self.assertEqual(
        _get_int_feature(example, 'numeric_relations'),
        [0, 0, 0, 0, 4, 2, 4, 2, 2, 0, 0, 0])
from tapas.scripts import prediction_utils

# Prepare the SQA results layout: directories for TF examples and the model.
os.makedirs('results/sqa/tf_examples', exist_ok=True)
os.makedirs('results/sqa/model', exist_ok=True)
# Write a checkpoint index file pointing at the copied checkpoint below.
with open('results/sqa/model/checkpoint', 'w') as f:
  f.write('model_checkpoint_path: "model.ckpt-0"')
# Copy the pretrained SQA checkpoint shards into the results model dir.
for suffix in ['.data-00000-of-00001', '.index', '.meta']:
  shutil.copyfile(f'tapas_sqa_base/model.ckpt{suffix}',
                  f'results/sqa/model/model.ckpt-0{suffix}')

max_seq_length = 512
vocab_file = "tapas_sqa_base/vocab.txt"
# Conversion config: column/row id limits are tied to the sequence length.
config = tf_example_utils.ClassifierConversionConfig(
    vocab_file=vocab_file,
    max_seq_length=max_seq_length,
    max_column_id=max_seq_length,
    max_row_id=max_seq_length,
    strip_column_names=False,
    add_aggregation_candidates=False,
)
converter = tf_example_utils.ToClassifierTensorflowExample(config)


def convert_interactions_to_examples(tables_and_queries):
  """Calls Tapas converter to convert interaction to example."""
  # Each input is a (table, queries) pair; one Interaction is built per pair,
  # with one Question per query. Question ids follow the "{idx}-0_{position}"
  # convention expected downstream.
  for idx, (table, queries) in enumerate(tables_and_queries):
    interaction = interaction_pb2.Interaction()
    for position, query in enumerate(queries):
      question = interaction.questions.add()
      question.original_text = query
      question.id = f"{idx}-0_{position}"
    # NOTE(review): the definition continues beyond this chunk; table[0] is
    # presumably the header row — confirm against the full file.
    for header in table[0]: