def parse_file_full_embeddings_tapas(fname, outfilename): model_name = 'google/tapas-base' tokenizer = TapasTokenizer.from_pretrained(model_name) config = TapasConfig('google-base-finetuned-wikisql-supervised') model = TapasForQuestionAnswering.from_pretrained(model_name, config=config).to(device) final_dict = {} with open(fname) as f: data = list(f) print("Num Examples: {}".format(len(data))) for i, line in enumerate(data): #print(line) result = json.loads(line) tbl_id = result['table_id'] table_string = ' '.join(_tbl(result)) table_list = table_string.split(SPLIT_WORD) table_list_filtered = [token for token in table_list if token != ''] dict_index = {key : [] for key in table_list_filtered} table = pd.DataFrame(dict_index) query = [' '.join(result['question']['words'])] inputs = tokenizer(table=table, queries=query) out = model(inputs)[0].tolist() final_dict[tbl_id] = out if i % 200 == 0: print("Num Examples Done: {}".format(i)) with open(outfilename, 'w') as outfile: json.dump(final_dict, outfile)
def load_model(): print('downloading model') model_name = 'google/tapas-base-finetuned-wtq' model = TapasForQuestionAnswering.from_pretrained(model_name) tokenizer = TapasTokenizer.from_pretrained(model_name) print('model downloaded') return model, tokenizer
def test_inference_question_answering_head_strong_supervision(self): # note that google/tapas-base-finetuned-wikisql-supervised should correspond to tapas_wikisql_sqa_inter_masklm_base_reset model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wikisql-supervised").to( torch_device ) tokenizer = self.default_tokenizer table, queries = prepare_tapas_single_inputs_for_inference() inputs = tokenizer(table=table, queries=queries, return_tensors="pt") inputs = {k: v.to(torch_device) for k, v in inputs.items()} outputs = model(**inputs) # test the logits logits = outputs.logits expected_shape = torch.Size((1, 21)) self.assertEqual(logits.shape, expected_shape) expected_tensor = torch.tensor( [ [ -10011.1084, -10011.1084, -10011.1084, -10011.1084, -10011.1084, -10011.1084, -10011.1084, -10011.1084, -10011.1084, -18.6185989, -10008.7969, 17.6355762, 17.6355762, 17.6355762, -10002.4404, -18.7111301, -18.7111301, -18.7111301, -18.7111301, -18.7111301, -10007.0977, ] ], device=torch_device, ) self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE)) # test the aggregation logits logits_aggregation = outputs.logits_aggregation expected_shape = torch.Size((1, 4)) self.assertEqual(logits_aggregation.shape, expected_shape) expected_tensor = torch.tensor( [[16.5659733, -3.06624889, -2.34152961, -0.970244825]], device=torch_device ) # PyTorch model outputs [[16.5679, -3.0668, -2.3442, -0.9674]] self.assertTrue(torch.allclose(logits_aggregation, expected_tensor, atol=TOLERANCE))
def test_inference_question_answering_head_weak_supervision(self): # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq").to(torch_device) tokenizer = self.default_tokenizer # let's test on a batch table, queries = prepare_tapas_batch_inputs_for_inference() inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="pt") inputs_on_device = {k: v.to(torch_device) for k, v in inputs.items()} outputs = model(**inputs_on_device) # test the logits logits = outputs.logits expected_shape = torch.Size((2, 28)) self.assertEqual(logits.shape, expected_shape) expected_slice = torch.tensor( [ [-160.375504, -160.375504, -160.375504, -10072.3965, -10070.9414, -10094.9736], [-9861.6123, -9861.6123, -9861.6123, -9861.6123, -9891.01172, 146.600677], ], device=torch_device, ) self.assertTrue(torch.allclose(logits[:, -6:], expected_slice, atol=TOLERANCE)) # test the aggregation logits logits_aggregation = outputs.logits_aggregation expected_shape = torch.Size((2, 4)) self.assertEqual(logits_aggregation.shape, expected_shape) expected_tensor = torch.tensor( [[18.8545208, -9.76614857, -6.3128891, -2.93525243], [-4.05782509, 40.0351, -5.35329962, 23.3978653]], device=torch_device, ) self.assertTrue(torch.allclose(logits_aggregation, expected_tensor, atol=TOLERANCE)) # test the predicted answer coordinates and aggregation indices EXPECTED_PREDICTED_ANSWER_COORDINATES = [[(0, 0)], [(1, 2)]] EXPECTED_PREDICTED_AGGREGATION_INDICES = [0, 1] predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( inputs, outputs.logits.detach().cpu(), outputs.logits_aggregation.detach().cpu() ) self.assertEqual(EXPECTED_PREDICTED_ANSWER_COORDINATES, predicted_answer_coordinates) self.assertEqual(EXPECTED_PREDICTED_AGGREGATION_INDICES, predicted_aggregation_indices)
def test_inference_question_answering_head_conversational_absolute_embeddings(self): # note that google/tapas-small-finetuned-sqa should correspond to tapas_sqa_inter_masklm_small_reset # however here we test the version with absolute position embeddings model = TapasForQuestionAnswering.from_pretrained("google/tapas-small-finetuned-sqa", revision="no_reset").to( torch_device ) tokenizer = self.default_tokenizer table, queries = prepare_tapas_single_inputs_for_inference() inputs = tokenizer(table=table, queries=queries, return_tensors="pt") inputs = {k: v.to(torch_device) for k, v in inputs.items()} outputs = model(**inputs) # test the logits logits = outputs.logits expected_shape = torch.Size((1, 21)) self.assertEqual(logits.shape, expected_shape) expected_tensor = torch.tensor( [ [ -10014.7793, -10014.7793, -10014.7793, -10014.7793, -10014.7793, -10014.7793, -10014.7793, -10014.7793, -10014.7793, -18.8419304, -10018.0391, 17.7848816, 17.7848816, 17.7848816, -9981.02832, -16.4005489, -16.4005489, -16.4005489, -16.4005489, -16.4005489, -10013.4736, ] ], device=torch_device, ) self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE))
def test_inference_question_answering_head_conversational(self): # note that google/tapas-base-finetuned-sqa should correspond to tapas_sqa_inter_masklm_base_reset model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-sqa").to(torch_device) tokenizer = self.default_tokenizer table, queries = prepare_tapas_single_inputs_for_inference() inputs = tokenizer(table=table, queries=queries, return_tensors="pt") inputs = {k: v.to(torch_device) for k, v in inputs.items()} outputs = model(**inputs) # test the logits logits = outputs.logits expected_shape = torch.Size((1, 21)) self.assertEqual(logits.shape, expected_shape) expected_tensor = torch.tensor( [ [ -9997.22461, -9997.22461, -9997.22461, -9997.22461, -9997.22461, -9997.22461, -9997.22461, -9997.22461, -9997.22461, -16.2628059, -10004.082, 15.4330549, 15.4330549, 15.4330549, -9990.42, -16.3270779, -16.3270779, -16.3270779, -16.3270779, -16.3270779, -10004.8506, ] ], device=torch_device, ) self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE))
def test_training_question_answering_head_weak_supervision(self): # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset model = TapasForQuestionAnswering.from_pretrained( "google/tapas-base-finetuned-wtq").to(torch_device) model.to(torch_device) # normally we should put the model in training mode but it's a pain to do this with the TF 1 implementation tokenizer = self.default_tokenizer # let's test on a batch table, queries, answer_coordinates, answer_text, float_answer = prepare_tapas_batch_inputs_for_training( ) inputs = tokenizer( table=table, queries=queries, answer_coordinates=answer_coordinates, answer_text=answer_text, padding="longest", return_tensors="pt", ) # prepare data (created by the tokenizer) and move to torch_device input_ids = inputs["input_ids"].to(torch_device) attention_mask = inputs["attention_mask"].to(torch_device) token_type_ids = inputs["token_type_ids"].to(torch_device) labels = inputs["labels"].to(torch_device) numeric_values = inputs["numeric_values"].to(torch_device) numeric_values_scale = inputs["numeric_values_scale"].to(torch_device) # the answer should be prepared by the user float_answer = torch.FloatTensor(float_answer).to(torch_device) # forward pass to get loss + logits: outputs = model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, float_answer=float_answer, ) # test the loss loss = outputs.loss expected_loss = torch.tensor(3.3527612686157227e-08, device=torch_device) self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-6)) # test the logits on the first example logits = outputs.logits expected_shape = torch.Size((2, 29)) self.assertEqual(logits.shape, expected_shape) expected_slice = torch.tensor( [ -160.0156, -160.0156, -160.0156, -160.0156, -160.0156, -10072.2266, -10070.8896, -10092.6006, -10092.6006, ], device=torch_device, ) self.assertTrue( torch.allclose(logits[0, -9:], expected_slice, atol=1e-6)) # test the aggregation logits on the second example logits_aggregation = outputs.logits_aggregation expected_shape = torch.Size((2, 4)) self.assertEqual(logits_aggregation.shape, expected_shape) expected_slice = torch.tensor([-4.0538, 40.0304, -5.3554, 23.3965], device=torch_device) self.assertTrue( torch.allclose(logits_aggregation[1, -4:], expected_slice, atol=1e-4))
def korean_table_question_answering_example(): from transformers import pipeline from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer import pandas as pd # REF [site] >> https://github.com/monologg/KoBERT-Transformers from tokenization_kobert import KoBertTokenizer data_dict = { '배우': ['송광호', '최민식', '설경구'], '나이': ['54', '58', '53'], '출연작품수': ['38', '32', '42'], '생년월일': ['1967/02/25', '1962/05/30', '1967/05/14'], } data_df = pd.DataFrame.from_dict(data_dict) if False: # Show the data frame. from IPython.display import display, HTML display(data_df) #print(HTML(data_df.to_html()).data) query = '최민식씨의 나이는?' # REF [site] >> https://huggingface.co/monologg pretrained_model_name = 'monologg/kobert' #pretrained_model_name = 'monologg/distilkobert' if False: # Not working. table_pipeline = pipeline( 'table-question-answering', model=pretrained_model_name, tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name) ) elif False: # Not working. #config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False) #model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name, config=config) model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name) table_pipeline = pipeline( 'table-question-answering', model=model, tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name) ) else: # Not correctly working. model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name) table_pipeline = pipeline( 'table-question-answering', model=model, tokenizer=TapasTokenizer.from_pretrained(pretrained_model_name) ) answer = table_pipeline(data_dict, query) #answer = table_pipeline(data_df, query) print('Answer: {}.'.format(answer))
import gradio as gr import pandas as pd from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasConfig model_name = 'google/tapas-base-finetuned-wtq' model = TapasForQuestionAnswering.from_pretrained(model_name) tokenizer = TapasTokenizer.from_pretrained(model_name) df_table = pd.read_csv("df_table.csv") df_table = {c: [str(x) for x in df_table[c].tolist()] for c in df_table.columns} df_table = pd.DataFrame.from_dict(df_table) def predict(table, queries): inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt") outputs = model(**inputs) predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()) # let's print out the results: id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] answers = [] for coordinates in predicted_answer_coordinates: if len(coordinates) == 1: # only a single cell: