def start_bundle(self):
  """Lazily builds the example converter for this bundle.

  Raises:
    ValueError: If the configured implementation type is not PYTHON.
  """
  convert_impl = ConverterImplType(self._convert_impl_value)
  # Guard clause: only the pure-Python converter is supported here.
  if convert_impl != ConverterImplType.PYTHON:
    raise ValueError(f'Unsupported implementation: {convert_impl.name}')
  self._converter = tf_example_utils.ToClassifierTensorflowExample(
      self._config)
def test_convert(self):
  """Converts a small numeric table and checks every emitted feature."""
  seq_len = 12
  with tempfile.TemporaryDirectory() as tmp_dir:
    vocab_path = os.path.join(tmp_dir, 'vocab.txt')
    _create_vocab(vocab_path, range(10))
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=tf_example_utils.ClassifierConversionConfig(
            vocab_file=vocab_path,
            max_seq_length=seq_len,
            max_column_id=seq_len,
            max_row_id=seq_len,
            strip_column_names=False,
            add_aggregation_candidates=False,
        ))

    def _row(texts):
      # Builds one table row from plain cell strings.
      return interaction_pb2.Cells(
          cells=[interaction_pb2.Cell(text=t) for t in texts])

    interaction = interaction_pb2.Interaction(
        table=interaction_pb2.Table(
            columns=[interaction_pb2.Cell(text=c) for c in ('A', 'B', 'C')],
            rows=[_row(('0', '4', '5')), _row(('1', '3', '5'))],
        ),
        questions=[interaction_pb2.Question(id='id', original_text='2')],
    )
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    self.assertEqual(_get_int_feature(example, 'input_ids'),
                     [2, 8, 3, 1, 1, 1, 6, 10, 11, 7, 9, 11])
    self.assertEqual(_get_int_feature(example, 'row_ids'),
                     [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2])
    self.assertEqual(_get_int_feature(example, 'column_ids'),
                     [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3])
    self.assertEqual(_get_int_feature(example, 'column_ranks'),
                     [0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 1])
    self.assertEqual(_get_int_feature(example, 'numeric_relations'),
                     [0, 0, 0, 0, 0, 0, 4, 2, 2, 4, 2, 2])
    self.assertEqual(
        _get_float_feature(example, 'question_numeric_values'),
        _clean_nans([2.0] + [_NAN] * (_MAX_NUMERIC_VALUES - 1)))
def test_convert_with_context_heading(self):
  """Checks that document title / context heading appear in the input ids."""
  seq_len = 20
  with tempfile.TemporaryDirectory() as tmp_dir:
    vocab_path = os.path.join(tmp_dir, 'vocab.txt')
    _create_vocab(vocab_path, ['a', 'b', 'c', 'd', 'e'])
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=tf_example_utils.ClassifierConversionConfig(
            vocab_file=vocab_path,
            max_seq_length=seq_len,
            max_column_id=seq_len,
            max_row_id=seq_len,
            strip_column_names=False,
            add_aggregation_candidates=False,
            use_document_title=True,
            use_context_title=True,
            update_answer_coordinates=True,
        ))
    interaction = interaction_pb2.Interaction(
        table=interaction_pb2.Table(
            document_title='E E',
            columns=[interaction_pb2.Cell(text=c) for c in ('A', 'A B C')],
            rows=[
                interaction_pb2.Cells(cells=[
                    interaction_pb2.Cell(text=t) for t in ('A B', 'A B C')
                ]),
            ],
            context_heading='B',
        ),
        questions=[
            interaction_pb2.Question(
                id='id',
                original_text='D',
                answer=interaction_pb2.Answer(answer_texts=['B C']),
            )
        ],
    )
    example = converter.convert(interaction, 0)
    logging.info(example)
    self.assertEqual(
        _get_int_feature(example, 'input_ids'),
        [2, 5, 3, 10, 10, 3, 7, 3, 6, 6, 7, 8, 6, 7, 6, 7, 8, 0, 0, 0])
    # The answer span 'B C' maps onto the last two non-padding positions.
    self.assertEqual(
        _get_int_feature(example, 'label_ids'),
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
def test_convert_with_trimmed_cell(self):
  """Checks cell_trim_length together with drop_rows_to_fit."""
  seq_len = 16
  with tempfile.TemporaryDirectory() as tmp_dir:
    vocab_path = os.path.join(tmp_dir, 'vocab.txt')
    _create_vocab(vocab_path, range(10))
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=tf_example_utils.ClassifierConversionConfig(
            vocab_file=vocab_path,
            max_seq_length=seq_len,
            max_column_id=seq_len,
            max_row_id=seq_len,
            strip_column_names=False,
            add_aggregation_candidates=False,
            cell_trim_length=2,
            drop_rows_to_fit=True))

    def _row(texts):
      # Builds one table row from plain cell strings.
      return interaction_pb2.Cells(
          cells=[interaction_pb2.Cell(text=t) for t in texts])

    interaction = interaction_pb2.Interaction(
        table=interaction_pb2.Table(
            columns=[
                interaction_pb2.Cell(text=c)
                for c in ('A', 'A A', 'A A A A')
            ],
            rows=[
                _row(('A A A', 'A A A', 'A A A')),
                _row(('A A A', 'A A A', 'A A A')),
            ],
        ),
        questions=[interaction_pb2.Question(id='id', original_text='A')],
    )
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    # We expect the second row to be dropped all cells should be trimmed to
    # >= 2 tokens.
    self.assertEqual(_get_int_feature(example, 'column_ids'),
                     [0, 0, 0, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 0, 0])
def test_get_empty_example(self):
  """An empty example must carry the reserved padded question id."""
  seq_len = 3
  with tempfile.TemporaryDirectory() as tmp_dir:
    vocab_path = os.path.join(tmp_dir, 'vocab.txt')
    _create_vocab(vocab_path, [])
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=tf_example_utils.ClassifierConversionConfig(
            vocab_file=vocab_path,
            max_seq_length=seq_len,
            max_column_id=seq_len,
            max_row_id=seq_len,
            strip_column_names=False,
            add_aggregation_candidates=False,
        ))
    example = converter.get_empty_example()
    logging.info(example)
    qid_bytes = _get_byte_feature(example, 'question_id')[0]
    self.assertEqual(qid_bytes.decode('utf-8'),
                     text_utils.get_padded_question_id())
def __init__(self,
             model_dir: Text,
             task: Text,
             tf_record_filename: Text = "test.tfrecord"):
  """Builds the TAPAS model, its config, and the example converter.

  Args:
    model_dir: Directory containing `vocab.txt` and `bert_config.json`.
    task: Task name forwarded to `get_config`.
    tf_record_filename: Name of the TFRecord file used for predictions.
  """
  self.tf_record_filename = tf_record_filename
  self.task = task
  vocab_file = os.path.join(model_dir, "vocab.txt")
  bert_config_file = os.path.join(model_dir, "bert_config.json")
  classifier_conversion_config = tf_example_utils.ClassifierConversionConfig(
      vocab_file=vocab_file,
      max_seq_length=_MAX_SEQ_LENGTH,
      max_column_id=_MAX_TABLE_ID,
      max_row_id=_MAX_TABLE_ID,
      strip_column_names=False,
      add_aggregation_candidates=False)
  self.converter = tf_example_utils.ToClassifierTensorflowExample(
      classifier_conversion_config)
  # FIX: the original `json.load(open(...))` leaked the file handle; use a
  # context manager so the file is closed deterministically.
  with open(bert_config_file) as bert_config_fp:
    self.bert_config = json.load(bert_config_fp)
  self.tapas_config = get_config(task, self.bert_config, model_dir)
  self.tapas = get_model(self.tapas_config, _MAX_SEQ_LENGTH)
def _create_examples(
    interaction_dir,
    example_dir,
    vocab_file,
    filename,
    batch_size,
    test_mode,
):
  """Creates TF examples for a single dataset and writes them to disk.

  Args:
    interaction_dir: Directory containing the input interaction TFRecord.
    example_dir: Directory the output example TFRecord is written to.
    vocab_file: Path to the BERT vocabulary file.
    filename: Dataset name (without the '.tfrecord' suffix).
    batch_size: If not None, pad the example count with empty examples up to
      a multiple of this size (so no example is dropped on TPU); if None,
      the examples are shuffled instead.
    test_mode: If True, stop after collecting 100 examples (fast test runs).
  """
  # FIX: the original assigned a constant f-string with no placeholder,
  # discarding the `filename` argument entirely.
  filename = f'{filename}.tfrecord'
  interaction_path = os.path.join(interaction_dir, filename)
  example_path = os.path.join(example_dir, filename)
  config = tf_example_utils.ClassifierConversionConfig(
      vocab_file=vocab_file,
      max_seq_length=FLAGS.max_seq_length,
      max_column_id=_MAX_TABLE_ID,
      max_row_id=_MAX_TABLE_ID,
      strip_column_names=False,
      add_aggregation_candidates=False,
  )
  converter = tf_example_utils.ToClassifierTensorflowExample(config)
  examples = []
  num_questions = 0
  num_conversion_errors = 0
  for interaction in prediction_utils.iterate_interactions(interaction_path):
    number_annotation_utils.add_numeric_values(interaction)
    for i in range(len(interaction.questions)):
      num_questions += 1
      try:
        examples.append(converter.convert(interaction, i))
      except ValueError as e:
        num_conversion_errors += 1
        logging.info("Can't convert interaction: %s error: %s",
                     interaction.id, e)
    if test_mode and len(examples) >= 100:
      break
  # FIX: the original printed a constant '(unknown)' instead of the filename.
  _print(f'Processed: {filename}')
  _print(f'Num questions processed: {num_questions}')
  _print(f'Num examples: {len(examples)}')
  _print(f'Num conversion errors: {num_conversion_errors}')
  if batch_size is None:
    random.shuffle(examples)
  else:
    # Make sure the eval sets are divisible by the test batch size since
    # otherwise examples will be dropped on TPU.
    # These examples will later be ignored when writing the predictions.
    original_num_examples = len(examples)  # typo fixed: "originial"
    while len(examples) % batch_size != 0:
      examples.append(converter.get_empty_example())
    if original_num_examples != len(examples):
      _print(f'Padded with {len(examples) - original_num_examples} examples.')
  with tf.io.TFRecordWriter(
      example_path,
      options=_to_tf_compression_type(FLAGS.compression_type),
  ) as writer:
    for example in examples:
      writer.write(example.SerializeToString())
def test_convert_with_token_selection(self):
  """The token-selection extension restricts which table tokens survive."""
  seq_len = 12
  with tempfile.TemporaryDirectory() as tmp_dir:
    vocab_path = os.path.join(tmp_dir, 'vocab.txt')
    _create_vocab(vocab_path, range(10))
    converter = tf_example_utils.ToClassifierTensorflowExample(
        config=tf_example_utils.ClassifierConversionConfig(
            vocab_file=vocab_path,
            max_seq_length=seq_len,
            max_column_id=seq_len,
            max_row_id=seq_len,
            strip_column_names=False,
            add_aggregation_candidates=False,
        ))

    def _row(texts):
      # Builds one table row from plain cell strings.
      return interaction_pb2.Cells(
          cells=[interaction_pb2.Cell(text=t) for t in texts])

    interaction = interaction_pb2.Interaction(
        table=interaction_pb2.Table(
            columns=[interaction_pb2.Cell(text=c) for c in ('A', 'B', 'C')],
            rows=[_row(('0 6', '4 7', '5 6')), _row(('1 7', '3 6', '5 5'))],
        ),
        questions=[interaction_pb2.Question(id='id', original_text='2')],
    )
    # (row, column, token) coordinates of the tokens to keep.
    table_coordinates = [
        table_selection_pb2.TableSelection.TokenCoordinates(
            row_index=r, column_index=c, token_index=t)
        for r, c, t in [(0, 0, 0), (1, 0, 0), (1, 2, 0), (2, 0, 0),
                        (2, 2, 0), (2, 2, 1)]
    ]
    interaction.questions[0].Extensions[
        table_selection_pb2.TableSelection.table_selection_ext].CopyFrom(
            table_selection_pb2.TableSelection(
                selected_tokens=table_coordinates))
    number_annotation_utils.add_numeric_values(interaction)
    example = converter.convert(interaction, 0)
    logging.info(example)
    self.assertEqual(_get_int_feature(example, 'input_ids'),
                     [2, 8, 3, 1, 6, 11, 7, 11, 11, 0, 0, 0])
    self.assertEqual(_get_int_feature(example, 'row_ids'),
                     [0, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0])
    self.assertEqual(_get_int_feature(example, 'column_ids'),
                     [0, 0, 0, 1, 1, 3, 1, 3, 3, 0, 0, 0])
    self.assertEqual(_get_int_feature(example, 'column_ranks'),
                     [0, 0, 0, 0, 1, 1, 2, 1, 1, 0, 0, 0])
    self.assertEqual(_get_int_feature(example, 'numeric_relations'),
                     [0, 0, 0, 0, 4, 2, 4, 2, 2, 0, 0, 0])
f.write('model_checkpoint_path: "model.ckpt-0"') for suffix in ['.data-00000-of-00001', '.index', '.meta']: shutil.copyfile(f'tapas_sqa_base/model.ckpt{suffix}', f'results/sqa/model/model.ckpt-0{suffix}') max_seq_length = 512 vocab_file = "tapas_sqa_base/vocab.txt" config = tf_example_utils.ClassifierConversionConfig( vocab_file=vocab_file, max_seq_length=max_seq_length, max_column_id=max_seq_length, max_row_id=max_seq_length, strip_column_names=False, add_aggregation_candidates=False, ) converter = tf_example_utils.ToClassifierTensorflowExample(config) def convert_interactions_to_examples(tables_and_queries): """Calls Tapas converter to convert interaction to example.""" for idx, (table, queries) in enumerate(tables_and_queries): interaction = interaction_pb2.Interaction() for position, query in enumerate(queries): question = interaction.questions.add() question.original_text = query question.id = f"{idx}-0_{position}" for header in table[0]: interaction.table.columns.add().text = header for line in table[1:]: row = interaction.table.rows.add() for cell in line:
def start_bundle(self):
  """Instantiates the TF-example converter once per Beam bundle."""
  # Built lazily here (not in the constructor) so the DoFn stays picklable.
  config = self._config
  self._converter = tf_example_utils.ToClassifierTensorflowExample(config)