import json
import pandas as pd
from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer

# SPLIT_WORD, _tbl, and device are assumed to be defined elsewhere in this repo.
def parse_file_full_embeddings_tapas(fname, outfilename):
    model_name = 'google/tapas-base'
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    # TapasConfig() takes keyword hyperparameters, not a checkpoint name;
    # a fine-tuned checkpoint's config must be loaded via from_pretrained().
    config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wikisql-supervised')
    model = TapasForQuestionAnswering.from_pretrained(model_name, config=config).to(device)

    final_dict = {}
    with open(fname) as f:
        data = list(f)
    print("Num Examples: {}".format(len(data)))
    for i, line in enumerate(data):
        #print(line)
        result = json.loads(line)
        tbl_id = result['table_id']
        # Rebuild the table header: split the flattened table string on the
        # column delimiter and drop empty tokens, then build an empty frame
        # whose columns are those header tokens.
        table_string = ' '.join(_tbl(result))
        table_list = table_string.split(SPLIT_WORD)
        table_list_filtered = [token for token in table_list if token != '']
        dict_index = {key: [] for key in table_list_filtered}
        table = pd.DataFrame(dict_index)
        query = [' '.join(result['question']['words'])]
        # return_tensors="pt" is required so the encoding can be fed to the model.
        inputs = tokenizer(table=table, queries=query, return_tensors="pt").to(device)
        out = model(**inputs)[0].tolist()
        final_dict[tbl_id] = out
        if i % 200 == 0:
            print("Num Examples Done: {}".format(i))

    with open(outfilename, 'w') as outfile:
        json.dump(final_dict, outfile)
def load_model():
    print('downloading model')
    model_name = 'google/tapas-base-finetuned-wtq'
    model = TapasForQuestionAnswering.from_pretrained(model_name)
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    print('model downloaded')
    return model, tokenizer
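# A minimal usage sketch for load_model() above; the table and the question
# are illustrative stand-ins, not from this repo.
import pandas as pd

model, tokenizer = load_model()
table = pd.DataFrame({
    'city': ['Paris', 'Berlin', 'Madrid'],
    'population': ['2148000', '3769000', '3223000'],  # TAPAS expects string cells
})
inputs = tokenizer(table=table, queries=['Which city has the largest population?'],
                   padding='max_length', return_tensors='pt')
outputs = model(**inputs)
coords, _ = tokenizer.convert_logits_to_predictions(
    inputs, outputs.logits.detach(), outputs.logits_aggregation.detach())
print([table.iat[c] for c in coords[0]])  # value(s) of the selected cell(s)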
def default_tokenizer(self):
    return TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")
def korean_table_question_answering_example():
    from transformers import pipeline
    from transformers import TapasConfig, TapasForQuestionAnswering, TapasTokenizer
    import pandas as pd
    # REF [site] >> https://github.com/monologg/KoBERT-Transformers
    from tokenization_kobert import KoBertTokenizer

    # Columns: actor, age, number of films, date of birth.
    data_dict = {
        '배우': ['송광호', '최민식', '설경구'],
        '나이': ['54', '58', '53'],
        '출연작품수': ['38', '32', '42'],
        '생년월일': ['1967/02/25', '1962/05/30', '1967/05/14'],
    }
    data_df = pd.DataFrame.from_dict(data_dict)

    if False:
        # Show the data frame.
        from IPython.display import display, HTML
        display(data_df)
        #print(HTML(data_df.to_html()).data)

    query = '최민식씨의 나이는?'  # "How old is Choi Min-sik?"

    # REF [site] >> https://huggingface.co/monologg
    pretrained_model_name = 'monologg/kobert'
    #pretrained_model_name = 'monologg/distilkobert'

    if False:  # Not working.
        table_pipeline = pipeline(
            'table-question-answering',
            model=pretrained_model_name,
            tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
        )
    elif False:  # Not working.
        #config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
        #model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name, config=config)
        model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)
        table_pipeline = pipeline(
            'table-question-answering',
            model=model,
            tokenizer=KoBertTokenizer.from_pretrained(pretrained_model_name)
        )
    else:  # Not correctly working.
        model = TapasForQuestionAnswering.from_pretrained(pretrained_model_name)
        table_pipeline = pipeline(
            'table-question-answering',
            model=model,
            tokenizer=TapasTokenizer.from_pretrained(pretrained_model_name)
        )

    answer = table_pipeline(data_dict, query)
    #answer = table_pipeline(data_df, query)
    print('Answer: {}.'.format(answer))
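# For comparison, a sketch of the same table rendered in English against the
# off-the-shelf English TAPAS pipeline ('google/tapas-base-finetuned-wtq');
# the translated column names and romanized actor names are my own rendering.
def english_table_question_answering_example():
    from transformers import pipeline

    data_en = {
        'actor': ['Song Gwang-ho', 'Choi Min-sik', 'Sol Kyung-gu'],
        'age': ['54', '58', '53'],
        'number of films': ['38', '32', '42'],
        'date of birth': ['1967/02/25', '1962/05/30', '1967/05/14'],
    }
    table_pipeline = pipeline('table-question-answering', model='google/tapas-base-finetuned-wtq')
    answer = table_pipeline(table=data_en, query='How old is Choi Min-sik?')
    print('Answer: {}.'.format(answer))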
import gradio as gr
import pandas as pd
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasConfig

model_name = 'google/tapas-base-finetuned-wtq'
model = TapasForQuestionAnswering.from_pretrained(model_name)
tokenizer = TapasTokenizer.from_pretrained(model_name)

# TAPAS requires every cell to be a string.
df_table = pd.read_csv("df_table.csv")
df_table = {c: [str(x) for x in df_table[c].tolist()] for c in df_table.columns}
df_table = pd.DataFrame.from_dict(df_table)

def predict(table, queries):
    inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt")
    outputs = model(**inputs)
    predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
        inputs, outputs.logits.detach(), outputs.logits_aggregation.detach())

    # Map aggregation indices to operator names.
    id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
    aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

    # The original snippet breaks off here; the rest follows the standard
    # TAPAS inference recipe for turning cell coordinates into answer strings.
    answers = []
    for coordinates in predicted_answer_coordinates:
        if len(coordinates) == 1:
            # Only a single cell: take its value directly.
            answers.append(table.iat[coordinates[0]])
        else:
            # Multiple cells: join their values.
            answers.append(", ".join(table.iat[coordinate] for coordinate in coordinates))
    return answers, aggregation_predictions_string
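# The snippet above imports gradio but stops before building the UI. A minimal
# sketch of how predict() might be wired into an interface, assuming the
# completed predict() returns (answers, aggregation operator names); the
# labels and layout here are assumptions.
def answer_question(query):
    answers, aggregations = predict(df_table, [query])
    return '{} > {}'.format(aggregations[0], answers[0])

iface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(label='Question about the table'),
    outputs=gr.Textbox(label='Answer'),
    title='TAPAS table question answering',
)
iface.launch()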
from transformers import (
    TapasConfig,
    TapasForMaskedLM,
    TapasForQuestionAnswering,
    TapasForSequenceClassification,
    TapasModel,
    TapasTokenizer,
    load_tf_weights_in_tapas,
)

def convert_tf_checkpoint_to_pytorch(task, reset_position_index_per_cell, tf_checkpoint_path,
                                     tapas_config_file, pytorch_dump_path):
    # Initialise PyTorch model.
    # If you want to convert a checkpoint that uses absolute position embeddings,
    # make sure to set reset_position_index_per_cell of TapasConfig to False.

    # Initialize configuration from json file.
    config = TapasConfig.from_json_file(tapas_config_file)
    # Set absolute/relative position embeddings parameter.
    config.reset_position_index_per_cell = reset_position_index_per_cell

    # Set remaining parameters of TapasConfig, and the model class, based on the task.
    if task == "SQA":
        model = TapasForQuestionAnswering(config=config)
    elif task == "WTQ":
        # run_task_main.py hparams
        config.num_aggregation_labels = 4
        config.use_answer_as_supervision = True
        # hparam_utils.py hparams
        config.answer_loss_cutoff = 0.664694
        config.cell_selection_preference = 0.207951
        config.huber_loss_delta = 0.121194
        config.init_cell_selection_weights_to_zero = True
        config.select_one_column = True
        config.allow_empty_column_selection = False
        config.temperature = 0.0352513
        model = TapasForQuestionAnswering(config=config)
    elif task == "WIKISQL_SUPERVISED":
        # run_task_main.py hparams
        config.num_aggregation_labels = 4
        config.use_answer_as_supervision = False
        # hparam_utils.py hparams
        config.answer_loss_cutoff = 36.4519
        config.cell_selection_preference = 0.903421
        config.huber_loss_delta = 222.088
        config.init_cell_selection_weights_to_zero = True
        config.select_one_column = True
        config.allow_empty_column_selection = True
        config.temperature = 0.763141
        model = TapasForQuestionAnswering(config=config)
    elif task == "TABFACT":
        model = TapasForSequenceClassification(config=config)
    elif task == "MLM":
        model = TapasForMaskedLM(config=config)
    elif task == "INTERMEDIATE_PRETRAINING":
        model = TapasModel(config=config)
    else:
        raise ValueError(f"Task {task} not supported.")

    print(f"Building PyTorch model from configuration: {config}")

    # Load weights from the TF checkpoint.
    load_tf_weights_in_tapas(model, config, tf_checkpoint_path)

    # Save the PyTorch model (weights and configuration).
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)

    # Save tokenizer files; the vocab file is expected next to the checkpoint
    # (the 10-character "model.ckpt" suffix is stripped off the path).
    print(f"Save tokenizer files to {pytorch_dump_path}")
    tokenizer = TapasTokenizer(vocab_file=tf_checkpoint_path[:-10] + "vocab.txt", model_max_length=512)
    tokenizer.save_pretrained(pytorch_dump_path)

    print("Used relative position embeddings:", model.config.reset_position_index_per_cell)
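# A sketch of a command-line entry point for the converter, mirroring the
# argparse block of the original transformers conversion script; check the
# argument names against the script version you are using.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--task", default="SQA", type=str,
                        help="Task: SQA, WTQ, WIKISQL_SUPERVISED, TABFACT, MLM or INTERMEDIATE_PRETRAINING.")
    parser.add_argument("--reset_position_index_per_cell", default=False, action="store_true",
                        help="Whether to use relative position embeddings.")
    parser.add_argument("--tf_checkpoint_path", default=None, type=str, required=True,
                        help="Path to the TensorFlow checkpoint (ends in 'model.ckpt').")
    parser.add_argument("--tapas_config_file", default=None, type=str, required=True,
                        help="Path to the json config of the pre-trained TAPAS model.")
    parser.add_argument("--pytorch_dump_path", default=None, type=str, required=True,
                        help="Directory for the converted PyTorch model.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(
        args.task,
        args.reset_position_index_per_cell,
        args.tf_checkpoint_path,
        args.tapas_config_file,
        args.pytorch_dump_path,
    )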