import json

import pandas as pd
import torch
from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer

# `_tbl`, `SPLIT_WORD`, and `device` are assumed to be defined elsewhere in the
# source repository.
def parse_file_full_embeddings_tapas(fname, outfilename):
    model_name = 'google/tapas-base'
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wikisql-supervised')
    model = TapasForQuestionAnswering.from_pretrained(model_name, config=config).to(device)

    final_dict = {}
    with open(fname) as f:
        data = list(f)

    print("Num Examples: {}".format(len(data)))
    for i, line in enumerate(data):
        result = json.loads(line)
        tbl_id = result['table_id']
        table_string = ' '.join(_tbl(result))
        table_list = table_string.split(SPLIT_WORD)
        table_list_filtered = [token for token in table_list if token != '']
        dict_index = {key : [] for key in table_list_filtered}
        table = pd.DataFrame(dict_index)

        query = [' '.join(result['question']['words'])]
        inputs = tokenizer(table=table, queries=query, return_tensors="pt").to(device)
        with torch.no_grad():
            out = model(**inputs).logits.tolist()
        final_dict[tbl_id] = out

        if i % 200 == 0:
            print("Num Examples Done: {}".format(i))

    with open(outfilename, 'w') as outfile:
        json.dump(final_dict, outfile)
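A minimal invocation sketch for the function above; the file names are hypothetical, and `_tbl`, `SPLIT_WORD`, and `device` must be defined as in the source repository:

# Hypothetical paths: a JSON-lines file of examples in, a JSON file of logits out.
parse_file_full_embeddings_tapas('dev.jsonl', 'dev_tapas_logits.json')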
Example #2
from transformers import TapasForQuestionAnswering, TapasTokenizer

def load_model():
    print('downloading model')
    model_name = 'google/tapas-base-finetuned-wtq'
    model = TapasForQuestionAnswering.from_pretrained(model_name)
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    print('model downloaded')
    return model, tokenizer
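A minimal usage sketch for `load_model`, assuming the standard TAPAS tokenizer/model API; the sample table and query are illustrative:

import pandas as pd

model, tokenizer = load_model()

# TAPAS expects every table cell to be a string.
table = pd.DataFrame({
    'Actors': ['Brad Pitt', 'Leonardo Di Caprio', 'George Clooney'],
    'Number of movies': ['87', '53', '69'],
})
queries = ['How many movies does Leonardo Di Caprio have?']

inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors='pt')
outputs = model(**inputs)

# Map token-level logits back to cell coordinates and an aggregation operator.
coords, agg_indices = tokenizer.convert_logits_to_predictions(
    inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
)
print(coords, agg_indices)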
Example #3
    def test_inference_question_answering_head_strong_supervision(self):
        # note that google/tapas-base-finetuned-wikisql-supervised should correspond to tapas_wikisql_sqa_inter_masklm_base_reset
        model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wikisql-supervised").to(
            torch_device
        )

        tokenizer = self.default_tokenizer
        table, queries = prepare_tapas_single_inputs_for_inference()
        inputs = tokenizer(table=table, queries=queries, return_tensors="pt")
        inputs = {k: v.to(torch_device) for k, v in inputs.items()}
        outputs = model(**inputs)
        # test the logits
        logits = outputs.logits
        expected_shape = torch.Size((1, 21))
        self.assertEqual(logits.shape, expected_shape)
        expected_tensor = torch.tensor(
            [
                [
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -18.6185989,
                    -10008.7969,
                    17.6355762,
                    17.6355762,
                    17.6355762,
                    -10002.4404,
                    -18.7111301,
                    -18.7111301,
                    -18.7111301,
                    -18.7111301,
                    -18.7111301,
                    -10007.0977,
                ]
            ],
            device=torch_device,
        )

        self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE))

        # test the aggregation logits
        logits_aggregation = outputs.logits_aggregation
        expected_shape = torch.Size((1, 4))
        self.assertEqual(logits_aggregation.shape, expected_shape)
        expected_tensor = torch.tensor(
            [[16.5659733, -3.06624889, -2.34152961, -0.970244825]], device=torch_device
        )  # PyTorch model outputs [[16.5679, -3.0668, -2.3442, -0.9674]]

        self.assertTrue(torch.allclose(logits_aggregation, expected_tensor, atol=TOLERANCE))
Example #4
    def test_inference_question_answering_head_weak_supervision(self):
        # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset
        model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq").to(torch_device)

        tokenizer = self.default_tokenizer
        # let's test on a batch
        table, queries = prepare_tapas_batch_inputs_for_inference()
        inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="pt")
        inputs_on_device = {k: v.to(torch_device) for k, v in inputs.items()}

        outputs = model(**inputs_on_device)
        # test the logits
        logits = outputs.logits
        expected_shape = torch.Size((2, 28))
        self.assertEqual(logits.shape, expected_shape)

        expected_slice = torch.tensor(
            [
                [-160.375504, -160.375504, -160.375504, -10072.3965, -10070.9414, -10094.9736],
                [-9861.6123, -9861.6123, -9861.6123, -9861.6123, -9891.01172, 146.600677],
            ],
            device=torch_device,
        )

        self.assertTrue(torch.allclose(logits[:, -6:], expected_slice, atol=TOLERANCE))

        # test the aggregation logits
        logits_aggregation = outputs.logits_aggregation
        expected_shape = torch.Size((2, 4))
        self.assertEqual(logits_aggregation.shape, expected_shape)
        expected_tensor = torch.tensor(
            [[18.8545208, -9.76614857, -6.3128891, -2.93525243], [-4.05782509, 40.0351, -5.35329962, 23.3978653]],
            device=torch_device,
        )

        self.assertTrue(torch.allclose(logits_aggregation, expected_tensor, atol=TOLERANCE))

        # test the predicted answer coordinates and aggregation indices
        EXPECTED_PREDICTED_ANSWER_COORDINATES = [[(0, 0)], [(1, 2)]]
        EXPECTED_PREDICTED_AGGREGATION_INDICES = [0, 1]

        predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
            inputs, outputs.logits.detach().cpu(), outputs.logits_aggregation.detach().cpu()
        )

        self.assertEqual(EXPECTED_PREDICTED_ANSWER_COORDINATES, predicted_answer_coordinates)
        self.assertEqual(EXPECTED_PREDICTED_AGGREGATION_INDICES, predicted_aggregation_indices)
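The test stops at coordinates and aggregation indices; a short sketch of the usual decoding step (the `id2aggregation` mapping matches the four-way aggregation head checked above):

# Sketch: `table` is the pandas DataFrame the tokenizer was called with.
id2aggregation = {0: 'NONE', 1: 'SUM', 2: 'AVERAGE', 3: 'COUNT'}
for coordinates, agg in zip(predicted_answer_coordinates, predicted_aggregation_indices):
    cells = [table.iat[coordinate] for coordinate in coordinates]
    print(id2aggregation[agg], ':', ', '.join(cells))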
Example #5
    def test_inference_question_answering_head_conversational_absolute_embeddings(self):
        # note that google/tapas-small-finetuned-sqa should correspond to tapas_sqa_inter_masklm_small_reset
        # however here we test the version with absolute position embeddings
        model = TapasForQuestionAnswering.from_pretrained("google/tapas-small-finetuned-sqa", revision="no_reset").to(
            torch_device
        )

        tokenizer = self.default_tokenizer
        table, queries = prepare_tapas_single_inputs_for_inference()
        inputs = tokenizer(table=table, queries=queries, return_tensors="pt")
        inputs = {k: v.to(torch_device) for k, v in inputs.items()}
        outputs = model(**inputs)
        # test the logits
        logits = outputs.logits
        expected_shape = torch.Size((1, 21))
        self.assertEqual(logits.shape, expected_shape)

        expected_tensor = torch.tensor(
            [
                [
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -18.8419304,
                    -10018.0391,
                    17.7848816,
                    17.7848816,
                    17.7848816,
                    -9981.02832,
                    -16.4005489,
                    -16.4005489,
                    -16.4005489,
                    -16.4005489,
                    -16.4005489,
                    -10013.4736,
                ]
            ],
            device=torch_device,
        )

        self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE))
Example #6
    def test_inference_question_answering_head_conversational(self):
        # note that google/tapas-base-finetuned-sqa should correspond to tapas_sqa_inter_masklm_base_reset
        model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-sqa").to(torch_device)

        tokenizer = self.default_tokenizer
        table, queries = prepare_tapas_single_inputs_for_inference()
        inputs = tokenizer(table=table, queries=queries, return_tensors="pt")
        inputs = {k: v.to(torch_device) for k, v in inputs.items()}
        outputs = model(**inputs)
        # test the logits
        logits = outputs.logits
        expected_shape = torch.Size((1, 21))
        self.assertEqual(logits.shape, expected_shape)

        expected_tensor = torch.tensor(
            [
                [
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -16.2628059,
                    -10004.082,
                    15.4330549,
                    15.4330549,
                    15.4330549,
                    -9990.42,
                    -16.3270779,
                    -16.3270779,
                    -16.3270779,
                    -16.3270779,
                    -16.3270779,
                    -10004.8506,
                ]
            ],
            device=torch_device,
        )

        self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE))
Example #7
    def test_training_question_answering_head_weak_supervision(self):
        # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset
        model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq").to(torch_device)
        # normally we should put the model in training mode but it's a pain to do this with the TF 1 implementation

        tokenizer = self.default_tokenizer
        # let's test on a batch
        table, queries, answer_coordinates, answer_text, float_answer = prepare_tapas_batch_inputs_for_training()
        inputs = tokenizer(
            table=table,
            queries=queries,
            answer_coordinates=answer_coordinates,
            answer_text=answer_text,
            padding="longest",
            return_tensors="pt",
        )

        # prepare data (created by the tokenizer) and move to torch_device
        input_ids = inputs["input_ids"].to(torch_device)
        attention_mask = inputs["attention_mask"].to(torch_device)
        token_type_ids = inputs["token_type_ids"].to(torch_device)
        labels = inputs["labels"].to(torch_device)
        numeric_values = inputs["numeric_values"].to(torch_device)
        numeric_values_scale = inputs["numeric_values_scale"].to(torch_device)

        # the answer should be prepared by the user
        float_answer = torch.FloatTensor(float_answer).to(torch_device)

        # forward pass to get loss + logits:
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
            numeric_values=numeric_values,
            numeric_values_scale=numeric_values_scale,
            float_answer=float_answer,
        )

        # test the loss
        loss = outputs.loss
        expected_loss = torch.tensor(3.3527612686157227e-08,
                                     device=torch_device)
        self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-6))

        # test the logits on the first example
        logits = outputs.logits
        expected_shape = torch.Size((2, 29))
        self.assertEqual(logits.shape, expected_shape)
        expected_slice = torch.tensor(
            [
                -160.0156,
                -160.0156,
                -160.0156,
                -160.0156,
                -160.0156,
                -10072.2266,
                -10070.8896,
                -10092.6006,
                -10092.6006,
            ],
            device=torch_device,
        )

        self.assertTrue(
            torch.allclose(logits[0, -9:], expected_slice, atol=1e-6))

        # test the aggregation logits on the second example
        logits_aggregation = outputs.logits_aggregation
        expected_shape = torch.Size((2, 4))
        self.assertEqual(logits_aggregation.shape, expected_shape)
        expected_slice = torch.tensor([-4.0538, 40.0304, -5.3554, 23.3965],
                                      device=torch_device)

        self.assertTrue(
            torch.allclose(logits_aggregation[1, -4:],
                           expected_slice,
                           atol=1e-4))
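The forward pass in this test already returns a loss, so an actual fine-tuning step only needs an optimizer around it; a minimal sketch reusing the tensors prepared above, with a placeholder learning rate:

from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)  # placeholder hyperparameter
model.train()

outputs = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
    labels=labels,
    numeric_values=numeric_values,
    numeric_values_scale=numeric_values_scale,
    float_answer=float_answer,
)
outputs.loss.backward()
optimizer.step()
optimizer.zero_grad()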
Example #8
def korean_table_question_answering_example():
	from transformers import pipeline
	from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer
	import pandas as pd
	# REF [site] >> https://github.com/monologg/KoBERT-Transformers
	from tokenization_kobert import KoBertTokenizer

	# Table columns: actor, age, number of films, date of birth.
	data_dict = {
		'배우': ['송광호', '최민식', '설경구'],
		'나이': ['54', '58', '53'],
		'출연작품수': ['38', '32', '42'],
		'생년월일': ['1967/02/25', '1962/05/30', '1967/05/14'],
	}
	data_df = pd.DataFrame.from_dict(data_dict)

	if False:
		# Show the data frame.
		from IPython.display import display, HTML
		display(data_df)
		#print(HTML(data_df.to_html()).data)

	query = '최민식씨의 나이는?'  # "How old is Choi Min-sik?"

	# REF [site] >> https://huggingface.co/monologg
	pretrained_model_name = 'monologg/kobert'
	#pretrained_model_name = 'monologg/distilkobert'

	if False:
		# Not working: KoBERT is not a TAPAS checkpoint, so the
		# table-question-answering pipeline cannot use it directly.

		table_pipeline = pipeline(
			'table-question-answering',
			model=pretrained_model_name,
			tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
		)
	elif False:
		# Not working: the KoBERT checkpoint lacks the TAPAS-specific weights.

		#config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
		#model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name, config=config)
		model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)

		table_pipeline = pipeline(
			'table-question-answering',
			model=model,
			tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
		)
	else:
		# Not correctly working: a TapasTokenizer loaded from the KoBERT
		# checkpoint does not yield meaningful predictions.

		model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)

		table_pipeline = pipeline(
			'table-question-answering',
			model=model,
			tokenizer=TapasTokenizer.from_pretrained(pretrained_model_name)
		)

	answer = table_pipeline(data_dict, query)
	#answer = table_pipeline(data_df, query)
	print('Answer: {}.'.format(answer))
Example #9
import gradio as gr
import pandas as pd
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasConfig

model_name = 'google/tapas-base-finetuned-wtq'
model = TapasForQuestionAnswering.from_pretrained(model_name)
tokenizer = TapasTokenizer.from_pretrained(model_name)

# TAPAS expects every table cell to be a string.
df_table = pd.read_csv("df_table.csv")
df_table = {c: [str(x) for x in df_table[c].tolist()]
            for c in df_table.columns}
df_table = pd.DataFrame.from_dict(df_table)


def predict(table, queries):
    inputs = tokenizer(table=table, queries=queries,
                       padding='max_length', return_tensors="pt")
    outputs = model(**inputs)
    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs,
        outputs.logits.detach(),
        outputs.logits_aggregation.detach())

    # let's print out the results:
    id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
    aggregation_predictions_string = [id2aggregation[x]
                                      for x in predicted_aggregation_indices]
    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # only a single cell: