Example No. 1
import torch
from transformers import TapasConfig, TapasForQuestionAnswering, load_tf_weights_in_tapas


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, tapas_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    #config = TapasConfig.from_json_file(tapas_config_file)
    config = TapasConfig(task="SQA")
    print("Building PyTorch model from configuration: {}".format(str(config)))
    #model = TapasForMaskedLM(config)
    model = TapasForQuestionAnswering(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_tapas(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
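This variant hardcodes the SQA task and ignores the config file argument. A minimal invocation sketch, with hypothetical placeholder paths:

convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="tapas_sqa_base/model.ckpt",      # hypothetical checkpoint path
    tapas_config_file="tapas_sqa_base/config.json",      # unused here, since the config is hardcoded
    pytorch_dump_path="tapas_sqa_base/pytorch_model.bin",
)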
Example No. 2
import json

import pandas as pd
import torch
from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer

# Note: SPLIT_WORD and _tbl are helpers defined elsewhere in the source repository.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # assumed device setup

def parse_file_full_embeddings_tapas(fname, outfilename):
    model_name = 'google/tapas-base'
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wikisql-supervised')
    model = TapasForQuestionAnswering.from_pretrained(model_name, config=config).to(device)

    final_dict = {}
    with open(fname) as f:
        data = list(f)

    print("Num Examples: {}".format(len(data)))
    for i, line in enumerate(data):
        #print(line)
        result = json.loads(line)
        tbl_id = result['table_id']
        table_string = ' '.join(_tbl(result))
        table_list = table_string.split(SPLIT_WORD)
        table_list_filtered = [token for token in table_list if token != '']
        dict_index = {key : [] for key in table_list_filtered}
        table = pd.DataFrame(dict_index)

        query = [' '.join(result['question']['words'])]
        inputs = tokenizer(table=table, queries=query, return_tensors="pt").to(device)
        with torch.no_grad():
            out = model(**inputs)[0].tolist()
        final_dict[tbl_id] = out

        if i % 200 == 0:
            print("Num Examples Done: {}".format(i))

    with open(outfilename, 'w') as outfile:
        json.dump(final_dict, outfile)
Example No. 3
from transformers import TapasForQuestionAnswering, TapasTokenizer


def load_model():
    print('downloading model')
    model_name = 'google/tapas-base-finetuned-wtq'
    model = TapasForQuestionAnswering.from_pretrained(model_name)
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    print('model downloaded')
    return model, tokenizer
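A short usage sketch for the loader above; the tiny table and question are illustrative placeholders (note that TAPAS expects all table cells as strings):

import pandas as pd

model, tokenizer = load_model()
table = pd.DataFrame({'City': ['Paris', 'Berlin'], 'Population': ['2161000', '3645000']})
inputs = tokenizer(table=table, queries=['What is the population of Berlin?'], return_tensors='pt')
outputs = model(**inputs)  # outputs.logits and outputs.logits_aggregation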
Example No. 4
    def test_inference_question_answering_head_strong_supervision(self):
        # note that google/tapas-base-finetuned-wikisql-supervised should correspond to tapas_wikisql_sqa_inter_masklm_base_reset
        model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wikisql-supervised").to(
            torch_device
        )

        tokenizer = self.default_tokenizer
        table, queries = prepare_tapas_single_inputs_for_inference()
        inputs = tokenizer(table=table, queries=queries, return_tensors="pt")
        inputs = {k: v.to(torch_device) for k, v in inputs.items()}
        outputs = model(**inputs)
        # test the logits
        logits = outputs.logits
        expected_shape = torch.Size((1, 21))
        self.assertEqual(logits.shape, expected_shape)
        expected_tensor = torch.tensor(
            [
                [
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -10011.1084,
                    -18.6185989,
                    -10008.7969,
                    17.6355762,
                    17.6355762,
                    17.6355762,
                    -10002.4404,
                    -18.7111301,
                    -18.7111301,
                    -18.7111301,
                    -18.7111301,
                    -18.7111301,
                    -10007.0977,
                ]
            ],
            device=torch_device,
        )

        self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE))

        # test the aggregation logits
        logits_aggregation = outputs.logits_aggregation
        expected_shape = torch.Size((1, 4))
        self.assertEqual(logits_aggregation.shape, expected_shape)
        expected_tensor = torch.tensor(
            [[16.5659733, -3.06624889, -2.34152961, -0.970244825]], device=torch_device
        )  # PyTorch model outputs [[16.5679, -3.0668, -2.3442, -0.9674]]

        self.assertTrue(torch.allclose(logits_aggregation, expected_tensor, atol=TOLERANCE))
Example No. 5
    def test_inference_question_answering_head_weak_supervision(self):
        # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset
        model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq").to(torch_device)

        tokenizer = self.default_tokenizer
        # let's test on a batch
        table, queries = prepare_tapas_batch_inputs_for_inference()
        inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="pt")
        inputs_on_device = {k: v.to(torch_device) for k, v in inputs.items()}

        outputs = model(**inputs_on_device)
        # test the logits
        logits = outputs.logits
        expected_shape = torch.Size((2, 28))
        self.assertEqual(logits.shape, expected_shape)

        expected_slice = torch.tensor(
            [
                [-160.375504, -160.375504, -160.375504, -10072.3965, -10070.9414, -10094.9736],
                [-9861.6123, -9861.6123, -9861.6123, -9861.6123, -9891.01172, 146.600677],
            ],
            device=torch_device,
        )

        self.assertTrue(torch.allclose(logits[:, -6:], expected_slice, atol=TOLERANCE))

        # test the aggregation logits
        logits_aggregation = outputs.logits_aggregation
        expected_shape = torch.Size((2, 4))
        self.assertEqual(logits_aggregation.shape, expected_shape)
        expected_tensor = torch.tensor(
            [[18.8545208, -9.76614857, -6.3128891, -2.93525243], [-4.05782509, 40.0351, -5.35329962, 23.3978653]],
            device=torch_device,
        )

        self.assertTrue(torch.allclose(logits_aggregation, expected_tensor, atol=TOLERANCE))

        # test the predicted answer coordinates and aggregation indices
        EXPECTED_PREDICTED_ANSWER_COORDINATES = [[(0, 0)], [(1, 2)]]
        EXPECTED_PREDICTED_AGGREGATION_INDICES = [0, 1]

        predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
            inputs, outputs.logits.detach().cpu(), outputs.logits_aggregation.detach().cpu()
        )

        self.assertEqual(EXPECTED_PREDICTED_ANSWER_COORDINATES, predicted_answer_coordinates)
        self.assertEqual(EXPECTED_PREDICTED_AGGREGATION_INDICES, predicted_aggregation_indices)
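The predicted coordinates are lists of (row, column) pairs into the original table. A minimal sketch of mapping them back to answer strings, assuming `table` is the pandas DataFrame returned by prepare_tapas_batch_inputs_for_inference():

answers = []
for coordinates in predicted_answer_coordinates:
    # join the string values of all selected cells for this query
    answers.append(", ".join(table.iat[coordinate] for coordinate in coordinates))
print(answers)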
Example No. 6
    def test_inference_question_answering_head_conversational_absolute_embeddings(self):
        # note that google/tapas-small-finetuned-sqa should correspond to tapas_sqa_inter_masklm_small_reset
        # however here we test the version with absolute position embeddings
        model = TapasForQuestionAnswering.from_pretrained("google/tapas-small-finetuned-sqa", revision="no_reset").to(
            torch_device
        )

        tokenizer = self.default_tokenizer
        table, queries = prepare_tapas_single_inputs_for_inference()
        inputs = tokenizer(table=table, queries=queries, return_tensors="pt")
        inputs = {k: v.to(torch_device) for k, v in inputs.items()}
        outputs = model(**inputs)
        # test the logits
        logits = outputs.logits
        expected_shape = torch.Size((1, 21))
        self.assertEqual(logits.shape, expected_shape)

        expected_tensor = torch.tensor(
            [
                [
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -10014.7793,
                    -18.8419304,
                    -10018.0391,
                    17.7848816,
                    17.7848816,
                    17.7848816,
                    -9981.02832,
                    -16.4005489,
                    -16.4005489,
                    -16.4005489,
                    -16.4005489,
                    -16.4005489,
                    -10013.4736,
                ]
            ],
            device=torch_device,
        )

        self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE))
Example No. 7
    def test_inference_question_answering_head_conversational(self):
        # note that google/tapas-base-finetuned-sqa should correspond to tapas_sqa_inter_masklm_base_reset
        model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-sqa").to(torch_device)

        tokenizer = self.default_tokenizer
        table, queries = prepare_tapas_single_inputs_for_inference()
        inputs = tokenizer(table=table, queries=queries, return_tensors="pt")
        inputs = {k: v.to(torch_device) for k, v in inputs.items()}
        outputs = model(**inputs)
        # test the logits
        logits = outputs.logits
        expected_shape = torch.Size((1, 21))
        self.assertEqual(logits.shape, expected_shape)

        expected_tensor = torch.tensor(
            [
                [
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -9997.22461,
                    -16.2628059,
                    -10004.082,
                    15.4330549,
                    15.4330549,
                    15.4330549,
                    -9990.42,
                    -16.3270779,
                    -16.3270779,
                    -16.3270779,
                    -16.3270779,
                    -16.3270779,
                    -10004.8506,
                ]
            ],
            device=torch_device,
        )

        self.assertTrue(torch.allclose(logits, expected_tensor, atol=TOLERANCE))
Example No. 8
    def test_training_question_answering_head_weak_supervision(self):
        # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset
        model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq").to(torch_device)
        # normally we should put the model in training mode but it's a pain to do this with the TF 1 implementation

        tokenizer = self.default_tokenizer
        # let's test on a batch
        table, queries, answer_coordinates, answer_text, float_answer = prepare_tapas_batch_inputs_for_training()
        inputs = tokenizer(
            table=table,
            queries=queries,
            answer_coordinates=answer_coordinates,
            answer_text=answer_text,
            padding="longest",
            return_tensors="pt",
        )

        # prepare data (created by the tokenizer) and move to torch_device
        input_ids = inputs["input_ids"].to(torch_device)
        attention_mask = inputs["attention_mask"].to(torch_device)
        token_type_ids = inputs["token_type_ids"].to(torch_device)
        labels = inputs["labels"].to(torch_device)
        numeric_values = inputs["numeric_values"].to(torch_device)
        numeric_values_scale = inputs["numeric_values_scale"].to(torch_device)

        # the answer should be prepared by the user
        float_answer = torch.FloatTensor(float_answer).to(torch_device)

        # forward pass to get loss + logits:
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
            numeric_values=numeric_values,
            numeric_values_scale=numeric_values_scale,
            float_answer=float_answer,
        )

        # test the loss
        loss = outputs.loss
        expected_loss = torch.tensor(3.3527612686157227e-08,
                                     device=torch_device)
        self.assertTrue(torch.allclose(loss, expected_loss, atol=1e-6))

        # test the logits on the first example
        logits = outputs.logits
        expected_shape = torch.Size((2, 29))
        self.assertEqual(logits.shape, expected_shape)
        expected_slice = torch.tensor(
            [
                -160.0156,
                -160.0156,
                -160.0156,
                -160.0156,
                -160.0156,
                -10072.2266,
                -10070.8896,
                -10092.6006,
                -10092.6006,
            ],
            device=torch_device,
        )

        self.assertTrue(
            torch.allclose(logits[0, -9:], expected_slice, atol=1e-6))

        # test the aggregation logits on the second example
        logits_aggregation = outputs.logits_aggregation
        expected_shape = torch.Size((2, 4))
        self.assertEqual(logits_aggregation.shape, expected_shape)
        expected_slice = torch.tensor([-4.0538, 40.0304, -5.3554, 23.3965],
                                      device=torch_device)

        self.assertTrue(
            torch.allclose(logits_aggregation[1, -4:],
                           expected_slice,
                           atol=1e-4))
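The test above only runs a forward pass to check the loss and logits. A minimal sketch of an actual weight update, assuming standard PyTorch training boilerplate and a hypothetical learning rate:

from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)  # hypothetical hyperparameters
model.train()
outputs = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
    labels=labels,
    numeric_values=numeric_values,
    numeric_values_scale=numeric_values_scale,
    float_answer=float_answer,
)
outputs.loss.backward()  # backpropagate the TAPAS cell-selection / aggregation loss
optimizer.step()
optimizer.zero_grad()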
Example No. 9
    def create_and_check_for_question_answering(
        self,
        config,
        input_ids,
        input_mask,
        token_type_ids,
        sequence_labels,
        token_labels,
        labels,
        numeric_values,
        numeric_values_scale,
        float_answer,
        aggregation_labels,
    ):
        # inference: without aggregation head (SQA). Model only returns logits
        sqa_config = copy.copy(config)
        sqa_config.num_aggregation_labels = 0
        sqa_config.use_answer_as_supervision = False
        model = TapasForQuestionAnswering(config=sqa_config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids=input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
        )
        self.parent.assertEqual(result.logits.shape,
                                (self.batch_size, self.seq_length))

        # inference: with aggregation head (WTQ, WikiSQL-supervised). Model returns logits and aggregation logits
        model = TapasForQuestionAnswering(config=config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids=input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
        )
        self.parent.assertEqual(result.logits.shape,
                                (self.batch_size, self.seq_length))
        self.parent.assertEqual(result.logits_aggregation.shape,
                                (self.batch_size, self.num_aggregation_labels))

        # training: can happen in 3 main ways
        # case 1: conversational (SQA)
        model = TapasForQuestionAnswering(config=sqa_config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )
        self.parent.assertEqual(result.loss.shape, ())
        self.parent.assertEqual(result.logits.shape,
                                (self.batch_size, self.seq_length))

        # case 2: weak supervision for aggregation (WTQ)
        model = TapasForQuestionAnswering(config=config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids=input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            labels=labels,
            numeric_values=numeric_values,
            numeric_values_scale=numeric_values_scale,
            float_answer=float_answer,
        )
        self.parent.assertEqual(result.loss.shape, ())
        self.parent.assertEqual(result.logits.shape,
                                (self.batch_size, self.seq_length))
        self.parent.assertEqual(result.logits_aggregation.shape,
                                (self.batch_size, self.num_aggregation_labels))

        # case 3: strong supervision for aggregation (WikiSQL-supervised)
        wikisql_config = copy.copy(config)
        wikisql_config.use_answer_as_supervision = False
        model = TapasForQuestionAnswering(config=wikisql_config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            labels=labels,
            aggregation_labels=aggregation_labels,
        )
        self.parent.assertEqual(result.loss.shape, ())
        self.parent.assertEqual(result.logits.shape,
                                (self.batch_size, self.seq_length))
        self.parent.assertEqual(result.logits_aggregation.shape,
                                (self.batch_size, self.num_aggregation_labels))
Example No. 10
def korean_table_question_answering_example():
	from transformers import pipeline
	from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer
	import pandas as pd
	# REF [site] >> https://github.com/monologg/KoBERT-Transformers
	from tokenization_kobert import KoBertTokenizer

	# Columns: actor, age, number of credited works, date of birth.
	data_dict = {
		'배우': ['송광호', '최민식', '설경구'],
		'나이': ['54', '58', '53'],
		'출연작품수': ['38', '32', '42'],
		'생년월일': ['1967/02/25', '1962/05/30', '1967/05/14'],
	}
	data_df = pd.DataFrame.from_dict(data_dict)

	if False:
		# Show the data frame.
		from IPython.display import display, HTML
		display(data_df)
		#print(HTML(data_df.to_html()).data)

	query = '최민식씨의 나이는?'  # "How old is Choi Min-sik?"

	# REF [site] >> https://huggingface.co/monologg
	pretrained_model_name = 'monologg/kobert'
	#pretrained_model_name = 'monologg/distilkobert'

	if False:
		# Not working.

		table_pipeline = pipeline(
			'table-question-answering',
			model=pretrained_model_name,
			tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
		)
	elif False:
		# Not working.

		#config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
		#model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name, config=config)
		model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)

		table_pipeline = pipeline(
			'table-question-answering',
			model=model,
			tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
		)
	else:
		# Not correctly working.

		model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)

		table_pipeline = pipeline(
			'table-question-answering',
			model=model,
			tokenizer=TapasTokenizer.from_pretrained(pretrained_model_name)
		)

	answer = table_pipeline(data_dict, query)
	#answer = table_pipeline(data_df, query)
	print('Answer: {}.'.format(answer))
Example No. 11
import gradio as gr
import pandas as pd
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasConfig

model_name = 'google/tapas-base-finetuned-wtq'
model = TapasForQuestionAnswering.from_pretrained(model_name)
tokenizer = TapasTokenizer.from_pretrained(model_name)

df_table = pd.read_csv("df_table.csv")
df_table = {c: [str(x) for x in df_table[c].tolist()]
            for c in df_table.columns}
df_table = pd.DataFrame.from_dict(df_table)


def predict(table, queries):
    inputs = tokenizer(table=table, queries=queries,
                       padding='max_length', return_tensors="pt")
    outputs = model(**inputs)
    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs,
        outputs.logits.detach(),
        outputs.logits_aggregation.detach())

    # let's print out the results:
    id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
    aggregation_predictions_string = [id2aggregation[x]
                                      for x in predicted_aggregation_indices]
    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # only a single cell:
            answers.append(table.iat[coordinates[0]])
        else:
            # multiple cells: join the selected cell values
            cell_values = [table.iat[coordinate] for coordinate in coordinates]
            answers.append(", ".join(cell_values))

    # The original snippet is truncated here; returning the answers together
    # with the aggregation labels is an assumed completion.
    return answers, aggregation_predictions_string
Example No. 12
def convert_tf_checkpoint_to_pytorch(task, reset_position_index_per_cell,
                                     tf_checkpoint_path, tapas_config_file,
                                     pytorch_dump_path):
    # Initialise PyTorch model.
    # If you want to convert a checkpoint that uses absolute position embeddings, make sure to set reset_position_index_per_cell of
    # TapasConfig to False.

    # initialize configuration from json file
    config = TapasConfig.from_json_file(tapas_config_file)
    # set absolute/relative position embeddings parameter
    config.reset_position_index_per_cell = reset_position_index_per_cell

    # set remaining parameters of TapasConfig as well as the model based on the task
    if task == "SQA":
        model = TapasForQuestionAnswering(config=config)
    elif task == "WTQ":
        # run_task_main.py hparams
        config.num_aggregation_labels = 4
        config.use_answer_as_supervision = True
        # hparam_utils.py hparams
        config.answer_loss_cutoff = 0.664694
        config.cell_selection_preference = 0.207951
        config.huber_loss_delta = 0.121194
        config.init_cell_selection_weights_to_zero = True
        config.select_one_column = True
        config.allow_empty_column_selection = False
        config.temperature = 0.0352513

        model = TapasForQuestionAnswering(config=config)
    elif task == "WIKISQL_SUPERVISED":
        # run_task_main.py hparams
        config.num_aggregation_labels = 4
        config.use_answer_as_supervision = False
        # hparam_utils.py hparams
        config.answer_loss_cutoff = 36.4519
        config.cell_selection_preference = 0.903421
        config.huber_loss_delta = 222.088
        config.init_cell_selection_weights_to_zero = True
        config.select_one_column = True
        config.allow_empty_column_selection = True
        config.temperature = 0.763141

        model = TapasForQuestionAnswering(config=config)
    elif task == "TABFACT":
        model = TapasForSequenceClassification(config=config)
    elif task == "MLM":
        model = TapasForMaskedLM(config=config)
    elif task == "INTERMEDIATE_PRETRAINING":
        model = TapasModel(config=config)
    else:
        raise ValueError(f"Task {task} not supported.")

    print(f"Building PyTorch model from configuration: {config}")
    # Load weights from tf checkpoint
    load_tf_weights_in_tapas(model, config, tf_checkpoint_path)

    # Save pytorch-model (weights and configuration)
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)

    # Save tokenizer files
    print(f"Save tokenizer files to {pytorch_dump_path}")
    tokenizer = TapasTokenizer(vocab_file=tf_checkpoint_path[:-10] + "vocab.txt", model_max_length=512)
    tokenizer.save_pretrained(pytorch_dump_path)

    print("Used relative position embeddings:",
          model.config.reset_position_index_per_cell)
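A hedged invocation sketch for the converter above, with hypothetical placeholder paths (the original conversion script wires these arguments up via argparse). Note that the vocab file is expected to sit next to the checkpoint, since the code strips the trailing "model.ckpt" to locate it:

convert_tf_checkpoint_to_pytorch(
    task="WTQ",
    reset_position_index_per_cell=True,   # set False for absolute position embeddings
    tf_checkpoint_path="tapas_wtq/model.ckpt",       # hypothetical checkpoint
    tapas_config_file="tapas_wtq/bert_config.json",  # hypothetical config
    pytorch_dump_path="tapas_wtq_pytorch/",          # output directory
)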