def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, tapas_config_file, pytorch_dump_path): # Initialise PyTorch model #config = TapasConfig.from_json_file(tapas_config_file) config = TapasConfig(task="SQA") print("Building PyTorch model from configuration: {}".format(str(config))) #model = TapasForMaskedLM(config) model = TapasForQuestionAnswering(config) # Load weights from tf checkpoint load_tf_weights_in_tapas(model, config, tf_checkpoint_path) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path)
def parse_file_full_embeddings_tapas(fname, outfilename): model_name = 'google/tapas-base' tokenizer = TapasTokenizer.from_pretrained(model_name) config = TapasConfig('google-base-finetuned-wikisql-supervised') model = TapasForQuestionAnswering.from_pretrained(model_name, config=config).to(device) final_dict = {} with open(fname) as f: data = list(f) print("Num Examples: {}".format(len(data))) for i, line in enumerate(data): #print(line) result = json.loads(line) tbl_id = result['table_id'] table_string = ' '.join(_tbl(result)) table_list = table_string.split(SPLIT_WORD) table_list_filtered = [token for token in table_list if token != ''] dict_index = {key : [] for key in table_list_filtered} table = pd.DataFrame(dict_index) query = [' '.join(result['question']['words'])] inputs = tokenizer(table=table, queries=query) out = model(inputs)[0].tolist() final_dict[tbl_id] = out if i % 200 == 0: print("Num Examples Done: {}".format(i)) with open(outfilename, 'w') as outfile: json.dump(final_dict, outfile)
def load_model(): print('downloading model') model_name = 'google/tapas-base-finetuned-wtq' model = TapasForQuestionAnswering.from_pretrained(model_name) tokenizer = TapasTokenizer.from_pretrained(model_name) print('model downloaded') return model, tokenizer
def test_inference_question_answering_head_strong_supervision(self): # note that google/tapas-base-finetuned-wikisql-supervised should correspond to tapas_wikisql_sqa_inter_masklm_base_reset model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wikisql-supervised").to( torch_device ) tokenizer = self.default_tokenizer table, queries = prepare_tapas_single_inputs_for_inference() inputs = tokenizer(table=table, queries=queries, return_tensors="pt") inputs = {k: v.to(torch_device) for k, v in inputs.items()} outputs = model(**inputs) # test the logits logits = outputs.logits expected_shape = torch.Size((1, 21)) self.assertEqual(logits.shape, expected_shape) expected_tensor = torch.tensor( [ [ -10011.1084, -10011.1084, -10011.1084, -10011.1084, -10011.1084, -10011.1084, -10011.1084, -10011.1084, -10011.1084, -18.6185989, -10008.7969, 17.6355762, 17.6355762, 17.6355762, -10002.4404, -18.7111301, -18.7111301, -18.7111301, -18.7111301, -18.7111301, -10007.0977, ] ], device=torch_device, ) self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE)) # test the aggregation logits logits_aggregation = outputs.logits_aggregation expected_shape = torch.Size((1, 4)) self.assertEqual(logits_aggregation.shape, expected_shape) expected_tensor = torch.tensor( [[16.5659733, -3.06624889, -2.34152961, -0.970244825]], device=torch_device ) # PyTorch model outputs [[16.5679, -3.0668, -2.3442, -0.9674]] self.assertTrue(torch.allclose(logits_aggregation, expected_tensor, atol=TOLERANCE))
def test_inference_question_answering_head_weak_supervision(self): # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq").to(torch_device) tokenizer = self.default_tokenizer # let's test on a batch table, queries = prepare_tapas_batch_inputs_for_inference() inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="pt") inputs_on_device = {k: v.to(torch_device) for k, v in inputs.items()} outputs = model(**inputs_on_device) # test the logits logits = outputs.logits expected_shape = torch.Size((2, 28)) self.assertEqual(logits.shape, expected_shape) expected_slice = torch.tensor( [ [-160.375504, -160.375504, -160.375504, -10072.3965, -10070.9414, -10094.9736], [-9861.6123, -9861.6123, -9861.6123, -9861.6123, -9891.01172, 146.600677], ], device=torch_device, ) self.assertTrue(torch.allclose(logits[:, -6:], expected_slice, atol=TOLERANCE)) # test the aggregation logits logits_aggregation = outputs.logits_aggregation expected_shape = torch.Size((2, 4)) self.assertEqual(logits_aggregation.shape, expected_shape) expected_tensor = torch.tensor( [[18.8545208, -9.76614857, -6.3128891, -2.93525243], [-4.05782509, 40.0351, -5.35329962, 23.3978653]], device=torch_device, ) self.assertTrue(torch.allclose(logits_aggregation, expected_tensor, atol=TOLERANCE)) # test the predicted answer coordinates and aggregation indices EXPECTED_PREDICTED_ANSWER_COORDINATES = [[(0, 0)], [(1, 2)]] EXPECTED_PREDICTED_AGGREGATION_INDICES = [0, 1] predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( inputs, outputs.logits.detach().cpu(), outputs.logits_aggregation.detach().cpu() ) self.assertEqual(EXPECTED_PREDICTED_ANSWER_COORDINATES, predicted_answer_coordinates) self.assertEqual(EXPECTED_PREDICTED_AGGREGATION_INDICES, predicted_aggregation_indices)
def test_inference_question_answering_head_conversational_absolute_embeddings(self): # note that google/tapas-small-finetuned-sqa should correspond to tapas_sqa_inter_masklm_small_reset # however here we test the version with absolute position embeddings model = TapasForQuestionAnswering.from_pretrained("google/tapas-small-finetuned-sqa", revision="no_reset").to( torch_device ) tokenizer = self.default_tokenizer table, queries = prepare_tapas_single_inputs_for_inference() inputs = tokenizer(table=table, queries=queries, return_tensors="pt") inputs = {k: v.to(torch_device) for k, v in inputs.items()} outputs = model(**inputs) # test the logits logits = outputs.logits expected_shape = torch.Size((1, 21)) self.assertEqual(logits.shape, expected_shape) expected_tensor = torch.tensor( [ [ -10014.7793, -10014.7793, -10014.7793, -10014.7793, -10014.7793, -10014.7793, -10014.7793, -10014.7793, -10014.7793, -18.8419304, -10018.0391, 17.7848816, 17.7848816, 17.7848816, -9981.02832, -16.4005489, -16.4005489, -16.4005489, -16.4005489, -16.4005489, -10013.4736, ] ], device=torch_device, ) self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE))
def test_inference_question_answering_head_conversational(self): # note that google/tapas-base-finetuned-sqa should correspond to tapas_sqa_inter_masklm_base_reset model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-sqa").to(torch_device) tokenizer = self.default_tokenizer table, queries = prepare_tapas_single_inputs_for_inference() inputs = tokenizer(table=table, queries=queries, return_tensors="pt") inputs = {k: v.to(torch_device) for k, v in inputs.items()} outputs = model(**inputs) # test the logits logits = outputs.logits expected_shape = torch.Size((1, 21)) self.assertEqual(logits.shape, expected_shape) expected_tensor = torch.tensor( [ [ -9997.22461, -9997.22461, -9997.22461, -9997.22461, -9997.22461, -9997.22461, -9997.22461, -9997.22461, -9997.22461, -16.2628059, -10004.082, 15.4330549, 15.4330549, 15.4330549, -9990.42, -16.3270779, -16.3270779, -16.3270779, -16.3270779, -16.3270779, -10004.8506, ] ], device=torch_device, ) self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE))
def test_training_question_answering_head_weak_supervision(self): # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset model = TapasForQuestionAnswering.from_pretrained( "google/tapas-base-finetuned-wtq").to(torch_device) model.to(torch_device) # normally we should put the model in training mode but it's a pain to do this with the TF 1 implementation tokenizer = self.default_tokenizer # let's test on a batch table, queries, answer_coordinates, answer_text, float_answer = prepare_tapas_batch_inputs_for_training( ) inputs = tokenizer( table=table, queries=queries, answer_coordinates=answer_coordinates, answer_text=answer_text, padding="longest", return_tensors="pt", ) # prepare data (created by the tokenizer) and move to torch_device input_ids = inputs["input_ids"].to(torch_device) attention_mask = inputs["attention_mask"].to(torch_device) token_type_ids = inputs["token_type_ids"].to(torch_device) labels = inputs["labels"].to(torch_device) numeric_values = inputs["numeric_values"].to(torch_device) numeric_values_scale = inputs["numeric_values_scale"].to(torch_device) # the answer should be prepared by the user float_answer = torch.FloatTensor(float_answer).to(torch_device) # forward pass to get loss + logits: outputs = model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, float_answer=float_answer, ) # test the loss loss = outputs.loss expected_loss = torch.tensor(3.3527612686157227e-08, device=torch_device) self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-6)) # test the logits on the first example logits = outputs.logits expected_shape = torch.Size((2, 29)) self.assertEqual(logits.shape, expected_shape) expected_slice = torch.tensor( [ -160.0156, -160.0156, -160.0156, -160.0156, -160.0156, -10072.2266, -10070.8896, -10092.6006, -10092.6006, ], device=torch_device, ) self.assertTrue( torch.allclose(logits[0, -9:], expected_slice, atol=1e-6)) # test the aggregation logits on the second example logits_aggregation = outputs.logits_aggregation expected_shape = torch.Size((2, 4)) self.assertEqual(logits_aggregation.shape, expected_shape) expected_slice = torch.tensor([-4.0538, 40.0304, -5.3554, 23.3965], device=torch_device) self.assertTrue( torch.allclose(logits_aggregation[1, -4:], expected_slice, atol=1e-4))
def create_and_check_for_question_answering( self, config, input_ids, input_mask, token_type_ids, sequence_labels, token_labels, labels, numeric_values, numeric_values_scale, float_answer, aggregation_labels, ): # inference: without aggregation head (SQA). Model only returns logits sqa_config = copy.copy(config) sqa_config.num_aggregation_labels = 0 sqa_config.use_answer_as_supervision = False model = TapasForQuestionAnswering(config=sqa_config) model.to(torch_device) model.eval() result = model( input_ids=input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, ) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) # inference: with aggregation head (WTQ, WikiSQL-supervised). Model returns logits and aggregation logits model = TapasForQuestionAnswering(config=config) model.to(torch_device) model.eval() result = model( input_ids=input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, ) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) # training: can happen in 3 main ways # case 1: conversational (SQA) model = TapasForQuestionAnswering(config=sqa_config) model.to(torch_device) model.eval() result = model( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=labels, ) self.parent.assertEqual(result.loss.shape, ()) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) # case 2: weak supervision for aggregation (WTQ) model = TapasForQuestionAnswering(config=config) model.to(torch_device) model.eval() result = model( input_ids=input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, float_answer=float_answer, ) self.parent.assertEqual(result.loss.shape, ()) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) # case 3: strong supervision for aggregation (WikiSQL-supervised) wikisql_config = copy.copy(config) wikisql_config.use_answer_as_supervision = False model = TapasForQuestionAnswering(config=wikisql_config) model.to(torch_device) model.eval() result = model( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=labels, aggregation_labels=aggregation_labels, ) self.parent.assertEqual(result.loss.shape, ()) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels))
def korean_table_question_answering_example(): from transformers import pipeline from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer import pandas as pd # REF [site] >> https://github.com/monologg/KoBERT-Transformers from tokenization_kobert import KoBertTokenizer data_dict = { '배우': ['송광호', '최민식', '설경구'], '나이': ['54', '58', '53'], '출연작품수': ['38', '32', '42'], '생년월일': ['1967/02/25', '1962/05/30', '1967/05/14'], } data_df = pd.DataFrame.from_dict(data_dict) if False: # Show the data frame. from IPython.display import display, HTML display(data_df) #print(HTML(data_df.to_html()).data) query = '최민식씨의 나이는?' # REF [site] >> https://huggingface.co/monologg pretrained_model_name = 'monologg/kobert' #pretrained_model_name = 'monologg/distilkobert' if False: # Not working. table_pipeline = pipeline( 'table-question-answering', model=pretrained_model_name, tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name) ) elif False: # Not working. #config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False) #model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name, config=config) model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name) table_pipeline = pipeline( 'table-question-answering', model=model, tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name) ) else: # Not correctly working. model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name) table_pipeline = pipeline( 'table-question-answering', model=model, tokenizer=TapasTokenizer.from_pretrained(pretrained_model_name) ) answer = table_pipeline(data_dict, query) #answer = table_pipeline(data_df, query) print('Answer: {}.'.format(answer))
import gradio as gr import pandas as pd from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasConfig model_name = 'google/tapas-base-finetuned-wtq' model = TapasForQuestionAnswering.from_pretrained(model_name) tokenizer = TapasTokenizer.from_pretrained(model_name) df_table = pd.read_csv("df_table.csv") df_table = {c: [str(x) for x in df_table[c].tolist()] for c in df_table.columns} df_table = pd.DataFrame.from_dict(df_table) def predict(table, queries): inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt") outputs = model(**inputs) predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()) # let's print out the results: id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] answers = [] for coordinates in predicted_answer_coordinates: if len(coordinates) == 1: # only a single cell:
def convert_tf_checkpoint_to_pytorch(task, reset_position_index_per_cell, tf_checkpoint_path, tapas_config_file, pytorch_dump_path): # Initialise PyTorch model. # If you want to convert a checkpoint that uses absolute position embeddings, make sure to set reset_position_index_per_cell of # TapasConfig to False. # initialize configuration from json file config = TapasConfig.from_json_file(tapas_config_file) # set absolute/relative position embeddings parameter config.reset_position_index_per_cell = reset_position_index_per_cell # set remaining parameters of TapasConfig as well as the model based on the task if task == "SQA": model = TapasForQuestionAnswering(config=config) elif task == "WTQ": # run_task_main.py hparams config.num_aggregation_labels = 4 config.use_answer_as_supervision = True # hparam_utils.py hparams config.answer_loss_cutoff = 0.664694 config.cell_selection_preference = 0.207951 config.huber_loss_delta = 0.121194 config.init_cell_selection_weights_to_zero = True config.select_one_column = True config.allow_empty_column_selection = False config.temperature = 0.0352513 model = TapasForQuestionAnswering(config=config) elif task == "WIKISQL_SUPERVISED": # run_task_main.py hparams config.num_aggregation_labels = 4 config.use_answer_as_supervision = False # hparam_utils.py hparams config.answer_loss_cutoff = 36.4519 config.cell_selection_preference = 0.903421 config.huber_loss_delta = 222.088 config.init_cell_selection_weights_to_zero = True config.select_one_column = True config.allow_empty_column_selection = True config.temperature = 0.763141 model = TapasForQuestionAnswering(config=config) elif task == "TABFACT": model = TapasForSequenceClassification(config=config) elif task == "MLM": model = TapasForMaskedLM(config=config) elif task == "INTERMEDIATE_PRETRAINING": model = TapasModel(config=config) else: raise ValueError(f"Task {task} not supported.") print(f"Building PyTorch model from configuration: {config}") # Load weights from tf checkpoint load_tf_weights_in_tapas(model, config, tf_checkpoint_path) # Save pytorch-model (weights and configuration) print(f"Save PyTorch model to {pytorch_dump_path}") model.save_pretrained(pytorch_dump_path) # Save tokenizer files print(f"Save tokenizer files to {pytorch_dump_path}") tokenizer = TapasTokenizer(vocab_file=tf_checkpoint_path[:-10] + "vocab.txt", model_max_length=512) tokenizer.save_pretrained(pytorch_dump_path) print("Used relative position embeddings:", model.config.reset_position_index_per_cell)